feat: browserPerProxy browser launch option (#2418)
Fixes the performance issues with the new proxy handling in browser
crawlers reported by @AndreyBykov's team.

By default, this reduces the proxy anti-blocking performance, though. Consider the
following snippet:

```typescript
  import { ProxyConfiguration, PuppeteerCrawler } from 'crawlee';

  const proxyConfiguration = new ProxyConfiguration({
    newUrlFunction: async () => {
      return `http://session-${Math.random().toString().slice(2,6)}:[email protected]:8000`;
    }
  })

  const crawler = new PuppeteerCrawler({
    proxyConfiguration,
    requestHandler: async ({ response, proxyInfo }) => {
      console.log((await response?.json()).ip);
    },
    headless: false,
    // `launchContext.browserPerProxy` is `false` by default
  });
  
  await crawler.run([
    'https://api.ipify.org/?format=json&q=qnom',
    'https://api.ipify.org/?format=json&q=bugt',
    'https://api.ipify.org/?format=json&q=qfju',
    'https://api.ipify.org/?format=json&q=utbb',
    'https://api.ipify.org/?format=json&q=ekqu',
  ]);

```

```
INFO  System info {"apifyVersion":"3.1.16","apifyClientVersion":"2.9.3","crawleeVersion":"3.9.0","osType":"Linux","nodeVersion":"v20.2.0"}
INFO  PuppeteerCrawler: Starting the crawler.
139.28.120.90
139.28.120.90
139.28.120.90
139.28.120.90
139.28.120.90
INFO  PuppeteerCrawler: All requests from the queue have been processed, the crawler will shut down.
INFO  PuppeteerCrawler: Final request statistics: {"requestsFinished":5,"requestsFailed":0,"retryHistogram":[5],"requestAvgFailedDurationMillis":null,"requestAvgFinishedDurationMillis":1189,"requestsFinishedPerMinute":86,"requestsFailedPerMinute":0,"requestTotalDurationMillis":5946,"requestsTotal":5,"crawlerRuntimeMillis":3489}
INFO  PuppeteerCrawler: Finished! Total 5 requests: 5 succeeded, 0 failed. {"terminal":true}

real    0m6,358s
user    0m6,097s
sys     0m0,929s
```

-------

All five requests above went through the same proxy because, by default
(`browserPerProxy: false`), the already-running browser is reused together with its
original proxy. With `browserPerProxy` enabled, the same code snippet runs roughly
twice as slowly, but each request uses the proxy URL generated for it:
```diff
  const proxyConfiguration = new ProxyConfiguration({
    newUrlFunction: async () => {
      return `http://session-${Math.random().toString().slice(2,6)}:[email protected]:8000`;
    }
  })

  const crawler = new PuppeteerCrawler({
    proxyConfiguration,
    requestHandler: async ({ response, proxyInfo }) => {
      console.log((await response?.json()).ip);
    },
    headless: false,
+    launchContext: {
+      browserPerProxy: true,
+    }
  });
  
  await crawler.run([
    'https://api.ipify.org/?format=json&q=qnom',
    'https://api.ipify.org/?format=json&q=bugt',
    'https://api.ipify.org/?format=json&q=qfju',
    'https://api.ipify.org/?format=json&q=utbb',
    'https://api.ipify.org/?format=json&q=ekqu',
  ]);
```
```
INFO  PuppeteerCrawler: Starting the crawler.
119.13.197.92
43.228.238.111
107.175.80.114
104.165.1.67
192.3.93.50
INFO  PuppeteerCrawler: All requests from the queue have been processed, the crawler will shut down.
INFO  PuppeteerCrawler: Final request statistics: {"requestsFinished":5,"requestsFailed":0,"retryHistogram":[5],"requestAvgFailedDurationMillis":null,"requestAvgFinishedDurationMillis":2263,"requestsFinishedPerMinute":34,"requestsFailedPerMinute":0,"requestTotalDurationMillis":11317,"requestsTotal":5,"crawlerRuntimeMillis":8765}
INFO  PuppeteerCrawler: Finished! Total 5 requests: 5 succeeded, 0 failed. {"terminal":true}

real    0m11,610s
user    0m12,990s
sys     0m3,295s
```

---------

Co-authored-by: Martin Adámek <[email protected]>
barjin and B4nan authored Apr 11, 2024
1 parent 0cd8017 commit df57b29
Showing 5 changed files with 69 additions and 3 deletions.
9 changes: 9 additions & 0 deletions packages/browser-crawler/src/internals/browser-launcher.ts
@@ -31,6 +31,14 @@ export interface BrowserLaunchContext<TOptions, Launcher> extends BrowserPluginO
*/
useChrome?: boolean;

/**
* If set to `true`, the crawler respects the proxy url generated for the given request.
* This aligns the browser-based crawlers with the `HttpCrawler`.
*
* Might cause performance issues, as Crawlee might launch too many browser instances.
*/
browserPerProxy?: boolean;

/**
* With this option selected, all pages will be opened in a new incognito browser context.
* This means they will not share cookies nor cache and their resources will not be throttled by one another.
@@ -98,6 +106,7 @@ export abstract class BrowserLauncher<
proxyUrl: ow.optional.string.url,
useChrome: ow.optional.boolean,
useIncognitoPages: ow.optional.boolean,
browserPerProxy: ow.optional.boolean,
experimentalContainers: ow.optional.boolean,
userDataDir: ow.optional.string,
launchOptions: ow.optional.object,
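
Since the new flag lives on the shared `BrowserLaunchContext`, the same option should work for the Playwright-based crawler as well as the Puppeteer one shown above. A minimal sketch, assuming the usual `crawlee` exports and placeholder proxy URLs:

```typescript
import { PlaywrightCrawler, ProxyConfiguration } from 'crawlee';

const crawler = new PlaywrightCrawler({
    // placeholder proxy URLs for illustration only
    proxyConfiguration: new ProxyConfiguration({
        proxyUrls: ['http://proxy-1.example.com:8000', 'http://proxy-2.example.com:8000'],
    }),
    launchContext: {
        // launch a dedicated browser for each distinct proxy URL
        browserPerProxy: true,
    },
    requestHandler: async ({ request, proxyInfo, log }) => {
        log.info(`Loaded ${request.url} via ${proxyInfo?.url}`);
    },
});

await crawler.run(['https://crawlee.dev']);
```
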
13 changes: 13 additions & 0 deletions packages/browser-pool/src/abstract-classes/browser-plugin.ts
@@ -75,6 +75,13 @@ export interface BrowserPluginOptions<LibraryOptions> {
* Path to a User Data Directory, which stores browser session data like cookies and local storage.
*/
userDataDir?: string;
/**
* If set to `true`, the crawler respects the proxy url generated for the given request.
* This aligns the browser-based crawlers with the `HttpCrawler`.
*
* Might cause performance issues, as Crawlee might launch too many browser instances.
*/
browserPerProxy?: boolean;
}

export interface CreateLaunchContextOptions<
@@ -112,13 +119,16 @@ export abstract class BrowserPlugin<

experimentalContainers: boolean;

browserPerProxy?: boolean;

constructor(library: Library, options: BrowserPluginOptions<LibraryOptions> = {}) {
const {
launchOptions = {} as LibraryOptions,
proxyUrl,
userDataDir,
useIncognitoPages = false,
experimentalContainers = false,
browserPerProxy = false,
} = options;

this.library = library;
@@ -127,6 +137,7 @@
this.userDataDir = userDataDir;
this.useIncognitoPages = useIncognitoPages;
this.experimentalContainers = experimentalContainers;
this.browserPerProxy = browserPerProxy;
}

/**
@@ -145,6 +156,7 @@
useIncognitoPages = this.useIncognitoPages,
userDataDir = this.userDataDir,
experimentalContainers = this.experimentalContainers,
browserPerProxy = this.browserPerProxy,
proxyTier,
} = options;

@@ -156,6 +168,7 @@
useIncognitoPages,
experimentalContainers,
userDataDir,
browserPerProxy,
proxyTier,
});
}
33 changes: 31 additions & 2 deletions packages/browser-pool/src/browser-pool.ts
@@ -94,6 +94,16 @@ export interface BrowserPoolOptions<Plugin extends BrowserPlugin = BrowserPlugin
* @default 300
*/
closeInactiveBrowserAfterSecs?: number;
/**
* Browsers are marked as retired after they have been inactive for a certain
* amount of time. This option sets the interval at which the browsers
* are checked and retired if they are inactive.
*
* Retired browsers are closed after all their pages are closed.
*
* @default 1
*/
retireInactiveBrowserAfterSecs?: number;
/**
* @default true
*/
@@ -306,6 +316,8 @@ export class BrowserPool<
BROWSER_KILLER_INTERVAL_MILLIS,
);

private browserRetireInterval?: NodeJS.Timeout;

private limiter = pLimit(1);

constructor(options: Options & BrowserPoolHooks<BrowserControllerReturn, LaunchContextReturn, PageReturn>) {
@@ -319,6 +331,7 @@
retireBrowserAfterPageCount: ow.optional.number,
operationTimeoutSecs: ow.optional.number,
closeInactiveBrowserAfterSecs: ow.optional.number,
retireInactiveBrowserAfterSecs: ow.optional.number,
preLaunchHooks: ow.optional.array,
postLaunchHooks: ow.optional.array,
prePageCreateHooks: ow.optional.array,
@@ -335,6 +348,7 @@
retireBrowserAfterPageCount = 100,
operationTimeoutSecs = 15,
closeInactiveBrowserAfterSecs = 300,
retireInactiveBrowserAfterSecs = 1,
preLaunchHooks = [],
postLaunchHooks = [],
prePageCreateHooks = [],
@@ -367,6 +381,18 @@
this.useFingerprints = useFingerprints;
this.fingerprintOptions = fingerprintOptions;

this.browserRetireInterval = setInterval(
async () => this.activeBrowserControllers.forEach((controller) => {
if (
controller.activePages === 0
&& controller.lastPageOpenedAt < (Date.now() - retireInactiveBrowserAfterSecs * 1000)
) {
this.retireBrowserController(controller);
}
}), retireInactiveBrowserAfterSecs * 1000);

this.browserRetireInterval!.unref();

// hooks
this.preLaunchHooks = preLaunchHooks;
this.postLaunchHooks = postLaunchHooks;
@@ -613,7 +639,9 @@
*/
async destroy(): Promise<void> {
clearInterval(this.browserKillerInterval!);
clearInterval(this.browserRetireInterval!);
this.browserKillerInterval = undefined;
this.browserRetireInterval = undefined;

await this.closeAllBrowsers();

@@ -706,13 +734,14 @@
return [...this.activeBrowserControllers].find((controller) => {
const hasCapacity = controller.activePages < this.maxOpenPagesPerBrowser;
const isCorrectPlugin = controller.browserPlugin === browserPlugin;
const isSameProxyUrl = (controller.proxyUrl === options?.proxyUrl);
const isSameProxyUrl = controller.proxyUrl === options?.proxyUrl;
const isCorrectProxyTier = controller.proxyTier === options?.proxyTier;

return isCorrectPlugin
&& hasCapacity
&& (
(options?.proxyTier && isCorrectProxyTier)
(!controller.launchContext.browserPerProxy && !options?.proxyTier)
|| (options?.proxyTier && isCorrectProxyTier)
|| (options?.proxyUrl && isSameProxyUrl)
|| (!options?.proxyUrl && !options?.proxyTier && !controller.proxyUrl && !controller.proxyTier)
);
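
The new `retireInactiveBrowserAfterSecs` option can also be passed to a standalone `BrowserPool`. A minimal sketch based on the test setup added in this PR; the timeouts and target URL are illustrative:

```typescript
import { BrowserPool, PlaywrightPlugin } from '@crawlee/browser-pool';
import playwright from 'playwright';

const browserPool = new BrowserPool({
    browserPlugins: [new PlaywrightPlugin(playwright.chromium)],
    // retire browsers that have no open pages and haven't opened one for ~10 s
    retireInactiveBrowserAfterSecs: 10,
    // close browsers that have stayed inactive for 5 minutes
    closeInactiveBrowserAfterSecs: 300,
});

const page = await browserPool.newPage();
await page.goto('https://crawlee.dev');
await page.close();

await browserPool.destroy();
```
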
10 changes: 10 additions & 0 deletions packages/browser-pool/src/launch-context.ts
@@ -34,6 +34,13 @@ export interface LaunchContextOptions<
* Those changes would be typically made in pre-launch hooks.
*/
launchOptions: LibraryOptions;
/**
* If set to `true`, the crawler respects the proxy url generated for the given request.
* This aligns the browser-based crawlers with the `HttpCrawler`.
*
* Might cause performance issues, as Crawlee might launch too many browser instances.
*/
browserPerProxy?: boolean;
/**
* By default pages share the same browser context.
* If set to `true` each page uses its own context that is destroyed once the page is closed or crashes.
@@ -64,6 +71,7 @@ export class LaunchContext<
browserPlugin: BrowserPlugin<Library, LibraryOptions, LaunchResult, NewPageOptions, NewPageResult>;
launchOptions: LibraryOptions;
useIncognitoPages: boolean;
browserPerProxy?: boolean;
experimentalContainers: boolean;
userDataDir: string;
proxyTier?: number;
@@ -81,6 +89,7 @@
launchOptions,
proxyUrl,
useIncognitoPages,
browserPerProxy,
experimentalContainers,
userDataDir = '',
proxyTier,
@@ -89,6 +98,7 @@
this.id = id;
this.browserPlugin = browserPlugin;
this.launchOptions = launchOptions;
this.browserPerProxy = browserPerProxy ?? false;
this.useIncognitoPages = useIncognitoPages ?? false;
this.experimentalContainers = experimentalContainers ?? false;
this.userDataDir = userDataDir;
7 changes: 6 additions & 1 deletion packages/browser-pool/test/multiple-plugins.test.ts
@@ -2,7 +2,11 @@ import { BrowserPool, PlaywrightPlugin } from '@crawlee/browser-pool';
import playwright from 'playwright';

describe('BrowserPool - Using multiple plugins', () => {
let browserPool: BrowserPool<{ browserPlugins: [PlaywrightPlugin, PlaywrightPlugin]; closeInactiveBrowserAfterSecs: 2 }>;
let browserPool: BrowserPool<{
browserPlugins: [PlaywrightPlugin, PlaywrightPlugin];
closeInactiveBrowserAfterSecs: 2;
retireInactiveBrowserAfterSecs: 30;
}>;
const chromePlugin = new PlaywrightPlugin(playwright.chromium);
const firefoxPlugin = new PlaywrightPlugin(playwright.firefox);

@@ -14,6 +18,7 @@ describe('BrowserPool - Using multiple plugins', () => {
firefoxPlugin,
],
closeInactiveBrowserAfterSecs: 2,
retireInactiveBrowserAfterSecs: 30,
});
});

