From b2bc749e27abd89b5d1815f28acf81d1248df381 Mon Sep 17 00:00:00 2001 From: Giorgos Komninos Date: Sun, 19 Jan 2025 14:05:45 +0200 Subject: [PATCH] feat: Improves proxy pool and adds configs to re-use browser page --- adapters/fetchers/jshttp/jshttp.go | 81 ++++++++++++++++++++---------- scrapemateapp/config.go | 26 ++++++++++ scrapemateapp/scrapemateapp.go | 9 +++- 3 files changed, 89 insertions(+), 27 deletions(-) diff --git a/adapters/fetchers/jshttp/jshttp.go b/adapters/fetchers/jshttp/jshttp.go index 26d9ecd..322986f 100644 --- a/adapters/fetchers/jshttp/jshttp.go +++ b/adapters/fetchers/jshttp/jshttp.go @@ -9,7 +9,7 @@ import ( var _ scrapemate.HTTPFetcher = (*jsFetch)(nil) -func New(headless, disableImages bool, rotator scrapemate.ProxyRotator) (scrapemate.HTTPFetcher, error) { +func New(headless, disableImages bool, rotator scrapemate.ProxyRotator, poolSize, pageReuseLimit, browserReuseLimit int) (scrapemate.HTTPFetcher, error) { opts := []*playwright.RunOptions{ { Browsers: []string{"chromium"}, @@ -20,30 +20,42 @@ func New(headless, disableImages bool, rotator scrapemate.ProxyRotator) (scrapem return nil, err } - const poolSize = 10 - pw, err := playwright.Run() if err != nil { return nil, err } ans := jsFetch{ - pw: pw, - headless: headless, - disableImages: disableImages, - pool: make(chan *browser, poolSize), - rotator: rotator, + pw: pw, + headless: headless, + disableImages: disableImages, + pool: make(chan *browser, poolSize), + rotator: rotator, + pageReuseLimit: pageReuseLimit, + browserReuseLimit: browserReuseLimit, + } + + for i := 0; i < poolSize; i++ { + b, err := newBrowser(pw, headless, disableImages, rotator) + if err != nil { + _ = ans.Close() + return nil, err + } + + ans.pool <- b } return &ans, nil } type jsFetch struct { - pw *playwright.Playwright - headless bool - disableImages bool - pool chan *browser - rotator scrapemate.ProxyRotator + pw *playwright.Playwright + headless bool + disableImages bool + pool chan *browser + rotator scrapemate.ProxyRotator + pageReuseLimit int + browserReuseLimit int } func (o *jsFetch) GetBrowser(ctx context.Context) (*browser, error) { @@ -51,15 +63,15 @@ func (o *jsFetch) GetBrowser(ctx context.Context) (*browser, error) { case <-ctx.Done(): return nil, ctx.Err() case ans := <-o.pool: - return ans, nil - default: - ans, err := newBrowser(o.pw, o.headless, o.disableImages, o.rotator) - if err != nil { - return nil, err + if ans.browser.IsConnected() && (o.browserReuseLimit <= 0 || ans.browserUsage < o.browserReuseLimit) { + return ans, nil } - return ans, nil + ans.browser.Close() + default: } + + return newBrowser(o.pw, o.headless, o.disableImages, o.rotator) } func (o *jsFetch) Close() error { @@ -75,6 +87,12 @@ func (o *jsFetch) Close() error { } func (o *jsFetch) PutBrowser(ctx context.Context, b *browser) { + if !b.browser.IsConnected() { + b.Close() + + return + } + select { case <-ctx.Done(): b.Close() @@ -117,21 +135,32 @@ func (o *jsFetch) Fetch(ctx context.Context, job scrapemate.IJob) scrapemate.Res Error: err, } } + } - // match the browser default timeout to the job timeout - if job.GetTimeout() > 0 { - page.SetDefaultTimeout(float64(job.GetTimeout().Milliseconds())) - } + // match the browser default timeout to the job timeout + if job.GetTimeout() > 0 { + page.SetDefaultTimeout(float64(job.GetTimeout().Milliseconds())) } - defer page.Close() + browser.page0Usage++ + browser.browserUsage++ + + defer func() { + if o.pageReuseLimit == 0 || browser.page0Usage >= o.pageReuseLimit { + _ = page.Close() + + browser.page0Usage = 0 + } + }() return job.BrowserActions(ctx, page) } type browser struct { - browser playwright.Browser - ctx playwright.BrowserContext + browser playwright.Browser + ctx playwright.BrowserContext + page0Usage int + browserUsage int } func (o *browser) Close() { diff --git a/scrapemateapp/config.go b/scrapemateapp/config.go index 91a4e37..1a971e5 100644 --- a/scrapemateapp/config.go +++ b/scrapemateapp/config.go @@ -28,6 +28,24 @@ func NewConfig(writers []scrapemate.ResultWriter, options ...func(*Config) error return &cfg, nil } +// WithBrowserReuseLimit sets the browser reuse limit of the app. +func WithBrowserReuseLimit(limit int) func(*Config) error { + return func(o *Config) error { + o.BrowserReuseLimit = limit + + return nil + } +} + +// WithPageReuseLimit sets the page reuse limit of the app. +func WithPageReuseLimit(limit int) func(*Config) error { + return func(o *Config) error { + o.PageReuseLimit = limit + + return nil + } +} + // WithConcurrency sets the concurrency of the app. func WithConcurrency(concurrency int) func(*Config) error { return func(o *Config) error { @@ -167,6 +185,14 @@ type Config struct { ExitOnInactivityDuration time.Duration // Proxies are the proxies to use for the app. Proxies []string + // BrowserReuseLimit is the limit of browser reuse. + // Only applicable when using JavaScript renderer. + // By default it is 0, which means the browser will be reused indefinitely. + BrowserReuseLimit int + // PageReuseLimit is the limit of page reuse. + // Only applicable when using JavaScript renderer. + // By default it is 0, which means the page will not be reused. + PageReuseLimit int } func (o *Config) validate() error { diff --git a/scrapemateapp/scrapemateapp.go b/scrapemateapp/scrapemateapp.go index a8ef095..543e916 100644 --- a/scrapemateapp/scrapemateapp.go +++ b/scrapemateapp/scrapemateapp.go @@ -162,7 +162,14 @@ func (app *ScrapemateApp) getFetcher() (scrapemate.HTTPFetcher, error) { switch app.cfg.UseJS { case true: - httpFetcher, err = jsfetcher.New(!app.cfg.JSOpts.Headfull, app.cfg.JSOpts.DisableImages, rotator) + httpFetcher, err = jsfetcher.New( + !app.cfg.JSOpts.Headfull, + app.cfg.JSOpts.DisableImages, + rotator, + app.cfg.Concurrency, + app.cfg.PageReuseLimit, + app.cfg.BrowserReuseLimit, + ) if err != nil { return nil, err }