diff --git a/packages/browser-crawler/src/internals/browser-launcher.ts b/packages/browser-crawler/src/internals/browser-launcher.ts index 3ec11a1c6454..22bb37b4df0f 100644 --- a/packages/browser-crawler/src/internals/browser-launcher.ts +++ b/packages/browser-crawler/src/internals/browser-launcher.ts @@ -31,6 +31,14 @@ export interface BrowserLaunchContext extends BrowserPluginO */ useChrome?: boolean; + /** + * If set to `true`, the crawler respects the proxy url generated for the given request. + * This aligns the browser-based crawlers with the `HttpCrawler`. + * + * Might cause performance issues, as Crawlee might launch too many browser instances. + */ + browserPerProxy?: boolean; + /** * With this option selected, all pages will be opened in a new incognito browser context. * This means they will not share cookies nor cache and their resources will not be throttled by one another. @@ -98,6 +106,7 @@ export abstract class BrowserLauncher< proxyUrl: ow.optional.string.url, useChrome: ow.optional.boolean, useIncognitoPages: ow.optional.boolean, + browserPerProxy: ow.optional.boolean, experimentalContainers: ow.optional.boolean, userDataDir: ow.optional.string, launchOptions: ow.optional.object, diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index 25a040410692..901a5ad232bb 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -75,6 +75,13 @@ export interface BrowserPluginOptions { * Path to a User Data Directory, which stores browser session data like cookies and local storage. */ userDataDir?: string; + /** + * If set to `true`, the crawler respects the proxy url generated for the given request. + * This aligns the browser-based crawlers with the `HttpCrawler`. + * + * Might cause performance issues, as Crawlee might launch too many browser instances. + */ + browserPerProxy?: boolean; } export interface CreateLaunchContextOptions< @@ -112,6 +119,8 @@ export abstract class BrowserPlugin< experimentalContainers: boolean; + browserPerProxy?: boolean; + constructor(library: Library, options: BrowserPluginOptions = {}) { const { launchOptions = {} as LibraryOptions, @@ -119,6 +128,7 @@ export abstract class BrowserPlugin< userDataDir, useIncognitoPages = false, experimentalContainers = false, + browserPerProxy = false, } = options; this.library = library; @@ -127,6 +137,7 @@ export abstract class BrowserPlugin< this.userDataDir = userDataDir; this.useIncognitoPages = useIncognitoPages; this.experimentalContainers = experimentalContainers; + this.browserPerProxy = browserPerProxy; } /** @@ -145,6 +156,7 @@ export abstract class BrowserPlugin< useIncognitoPages = this.useIncognitoPages, userDataDir = this.userDataDir, experimentalContainers = this.experimentalContainers, + browserPerProxy = this.browserPerProxy, proxyTier, } = options; @@ -156,6 +168,7 @@ export abstract class BrowserPlugin< useIncognitoPages, experimentalContainers, userDataDir, + browserPerProxy, proxyTier, }); } diff --git a/packages/browser-pool/src/browser-pool.ts b/packages/browser-pool/src/browser-pool.ts index 9171fb022682..d2908c282d7a 100644 --- a/packages/browser-pool/src/browser-pool.ts +++ b/packages/browser-pool/src/browser-pool.ts @@ -94,6 +94,16 @@ export interface BrowserPoolOptions) { @@ -319,6 +331,7 @@ export class BrowserPool< retireBrowserAfterPageCount: ow.optional.number, operationTimeoutSecs: ow.optional.number, closeInactiveBrowserAfterSecs: ow.optional.number, + retireInactiveBrowserAfterSecs: ow.optional.number, preLaunchHooks: ow.optional.array, postLaunchHooks: ow.optional.array, prePageCreateHooks: ow.optional.array, @@ -335,6 +348,7 @@ export class BrowserPool< retireBrowserAfterPageCount = 100, operationTimeoutSecs = 15, closeInactiveBrowserAfterSecs = 300, + retireInactiveBrowserAfterSecs = 1, preLaunchHooks = [], postLaunchHooks = [], prePageCreateHooks = [], @@ -367,6 +381,18 @@ export class BrowserPool< this.useFingerprints = useFingerprints; this.fingerprintOptions = fingerprintOptions; + this.browserRetireInterval = setInterval( + async () => this.activeBrowserControllers.forEach((controller) => { + if ( + controller.activePages === 0 + && controller.lastPageOpenedAt < (Date.now() - retireInactiveBrowserAfterSecs * 1000) + ) { + this.retireBrowserController(controller); + } + }), retireInactiveBrowserAfterSecs * 1000); + + this.browserRetireInterval!.unref(); + // hooks this.preLaunchHooks = preLaunchHooks; this.postLaunchHooks = postLaunchHooks; @@ -613,7 +639,9 @@ export class BrowserPool< */ async destroy(): Promise { clearInterval(this.browserKillerInterval!); + clearInterval(this.browserRetireInterval!); this.browserKillerInterval = undefined; + this.browserRetireInterval = undefined; await this.closeAllBrowsers(); @@ -706,13 +734,14 @@ export class BrowserPool< return [...this.activeBrowserControllers].find((controller) => { const hasCapacity = controller.activePages < this.maxOpenPagesPerBrowser; const isCorrectPlugin = controller.browserPlugin === browserPlugin; - const isSameProxyUrl = (controller.proxyUrl === options?.proxyUrl); + const isSameProxyUrl = controller.proxyUrl === options?.proxyUrl; const isCorrectProxyTier = controller.proxyTier === options?.proxyTier; return isCorrectPlugin && hasCapacity && ( - (options?.proxyTier && isCorrectProxyTier) + (!controller.launchContext.browserPerProxy && !options?.proxyTier) + || (options?.proxyTier && isCorrectProxyTier) || (options?.proxyUrl && isSameProxyUrl) || (!options?.proxyUrl && !options?.proxyTier && !controller.proxyUrl && !controller.proxyTier) ); diff --git a/packages/browser-pool/src/launch-context.ts b/packages/browser-pool/src/launch-context.ts index a89d31edf29f..e7cbdfbb4aab 100644 --- a/packages/browser-pool/src/launch-context.ts +++ b/packages/browser-pool/src/launch-context.ts @@ -34,6 +34,13 @@ export interface LaunchContextOptions< * Those changes would be typically made in pre-launch hooks. */ launchOptions: LibraryOptions; + /** + * If set to `true`, the crawler respects the proxy url generated for the given request. + * This aligns the browser-based crawlers with the `HttpCrawler`. + * + * Might cause performance issues, as Crawlee might launch too many browser instances. + */ + browserPerProxy?: boolean; /** * By default pages share the same browser context. * If set to `true` each page uses its own context that is destroyed once the page is closed or crashes. @@ -64,6 +71,7 @@ export class LaunchContext< browserPlugin: BrowserPlugin; launchOptions: LibraryOptions; useIncognitoPages: boolean; + browserPerProxy?: boolean; experimentalContainers: boolean; userDataDir: string; proxyTier?: number; @@ -81,6 +89,7 @@ export class LaunchContext< launchOptions, proxyUrl, useIncognitoPages, + browserPerProxy, experimentalContainers, userDataDir = '', proxyTier, @@ -89,6 +98,7 @@ export class LaunchContext< this.id = id; this.browserPlugin = browserPlugin; this.launchOptions = launchOptions; + this.browserPerProxy = browserPerProxy ?? false; this.useIncognitoPages = useIncognitoPages ?? false; this.experimentalContainers = experimentalContainers ?? false; this.userDataDir = userDataDir; diff --git a/packages/browser-pool/test/multiple-plugins.test.ts b/packages/browser-pool/test/multiple-plugins.test.ts index 6390c0a390e3..25db43d39acd 100644 --- a/packages/browser-pool/test/multiple-plugins.test.ts +++ b/packages/browser-pool/test/multiple-plugins.test.ts @@ -2,7 +2,11 @@ import { BrowserPool, PlaywrightPlugin } from '@crawlee/browser-pool'; import playwright from 'playwright'; describe('BrowserPool - Using multiple plugins', () => { - let browserPool: BrowserPool<{ browserPlugins: [PlaywrightPlugin, PlaywrightPlugin]; closeInactiveBrowserAfterSecs: 2 }>; + let browserPool: BrowserPool<{ + browserPlugins: [PlaywrightPlugin, PlaywrightPlugin]; + closeInactiveBrowserAfterSecs: 2; + retireInactiveBrowserAfterSecs: 30; + }>; const chromePlugin = new PlaywrightPlugin(playwright.chromium); const firefoxPlugin = new PlaywrightPlugin(playwright.firefox); @@ -14,6 +18,7 @@ describe('BrowserPool - Using multiple plugins', () => { firefoxPlugin, ], closeInactiveBrowserAfterSecs: 2, + retireInactiveBrowserAfterSecs: 30, }); });