Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,18 @@ export interface AdaptivePlaywrightCrawlerOptions
*/
resultChecker?: (result: RequestHandlerResult) => boolean;

/**
* An optional callback that decides whether an error thrown during the plain HTTP request handler
* should be propagated (instead of falling back to browser navigation).
*
* The callback receives the thrown error and the crawling context of the request being handled.
* It may be synchronous or asynchronous - the returned value is awaited before a decision is made.
*
* If the callback returns `true`, the error is thrown, triggering the standard retry mechanism.
* If the callback returns `false` (or is not provided), the error is logged and the crawler
* falls back to browser navigation (default behavior).
*
* @default () => false
*/
shouldPropagateError?: (error: Error, context: PlaywrightCrawlingContext) => Awaitable<boolean>;

/**
* An optional callback used in rendering type detection. On each detection, the result of the plain HTTP run is compared to that of the browser one.
* If a callback is provided, the contract is as follows:
Expand Down Expand Up @@ -267,6 +279,7 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
private adaptiveRequestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] & {};
private renderingTypePredictor: NonNullable<AdaptivePlaywrightCrawlerOptions['renderingTypePredictor']>;
private resultChecker: NonNullable<AdaptivePlaywrightCrawlerOptions['resultChecker']>;
private shouldPropagateError: NonNullable<AdaptivePlaywrightCrawlerOptions['shouldPropagateError']>;
private resultComparator: NonNullable<AdaptivePlaywrightCrawlerOptions['resultComparator']>;
private preventDirectStorageAccess: boolean;
declare readonly stats: AdaptivePlaywrightCrawlerStatistics;
Expand All @@ -289,6 +302,7 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
renderingTypeDetectionRatio = 0.1,
renderingTypePredictor,
resultChecker,
shouldPropagateError,
resultComparator,
statisticsOptions,
preventDirectStorageAccess = true,
Expand All @@ -300,6 +314,7 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
this.renderingTypePredictor =
renderingTypePredictor ?? new RenderingTypePredictor({ detectionRatio: renderingTypeDetectionRatio });
this.resultChecker = resultChecker ?? (() => true);
this.shouldPropagateError = shouldPropagateError ?? (() => false);

if (resultComparator !== undefined) {
this.resultComparator = resultComparator;
Expand Down Expand Up @@ -366,8 +381,14 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
return;
}
if (!plainHTTPRun.ok) {
const error = plainHTTPRun.error as Error;

if (await this.shouldPropagateError(error, crawlingContext)) {
throw error;
}

crawlingContext.log.exception(
plainHTTPRun.error as Error,
error,
`HTTP-only request handler failed for ${crawlingContext.request.url}`,
);
} else {
Expand Down
60 changes: 60 additions & 0 deletions test/core/crawlers/adaptive_playwright_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,66 @@ describe('AdaptivePlaywrightCrawler', () => {
expect(resultChecker).toHaveBeenCalledTimes(1);
});

describe('shouldPropagateError', () => {
    // Fixtures shared by both tests. The predictor is configured with the 'static' rendering type and
    // a detection probability of 0; the request handler rejects with the same error instance every time.
    const renderingTypePredictor = makeRiggedRenderingTypePredictor({
        detectionProbabilityRecommendation: 0,
        renderingType: 'static',
    });
    const failedRequestHandler = vi.fn();
    const testError = new Error('HTTP handler failed');
    const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async () => {
        throw testError;
    });

    /**
     * Creates a one-shot crawler pointed at the `/static` test page, wired with the shared fixtures
     * and the supplied `shouldPropagateError` callback, and runs it to completion.
     */
    const runCrawlerWith = async (
        shouldPropagateError: NonNullable<AdaptivePlaywrightCrawlerOptions['shouldPropagateError']>,
    ): Promise<void> => {
        const target = new URL(`http://${HOSTNAME}:${port}/static`);
        const options = {
            requestHandler,
            renderingTypePredictor,
            shouldPropagateError,
            failedRequestHandler,
        };

        const crawler = await makeOneshotCrawler(options, [target.toString()]);
        await crawler.run();
    };

    // Call counts on the shared mocks must not leak from one test into the next.
    beforeEach(() => {
        vi.clearAllMocks();
    });

    test('should fall back to browser when shouldPropagateError returns false', async () => {
        const shouldPropagateError = vi.fn(() => false);

        await runCrawlerWith(shouldPropagateError);

        expect(shouldPropagateError).toHaveBeenCalledTimes(1);
        expect(shouldPropagateError).toHaveBeenCalledWith(testError, expect.anything());
        // Two handler invocations: the failing HTTP-only attempt followed by the browser fallback.
        expect(requestHandler).toHaveBeenCalledTimes(2);
    });

    test('should propagate error when shouldPropagateError returns true', async () => {
        const shouldPropagateError = vi.fn(() => true);

        await runCrawlerWith(shouldPropagateError);

        expect(shouldPropagateError).toHaveBeenCalledTimes(1);
        expect(shouldPropagateError).toHaveBeenCalledWith(testError, expect.anything());
        // Only the HTTP-only attempt ran; the rethrown error is handed to the failed-request handler.
        expect(requestHandler).toHaveBeenCalledTimes(1);
        expect(failedRequestHandler).toHaveBeenCalledTimes(1);
        expect(failedRequestHandler.mock.calls[0][1]).toBe(testError);
    });
});

test.each([
['static'],
['clientOnly'],
Expand Down
Loading