From c69421bcbf6a4b85e50e1f7f94c9ae1e6f01928b Mon Sep 17 00:00:00 2001
From: Giorgos Komninos
Date: Wed, 1 Jan 2025 10:35:42 +0200
Subject: [PATCH] feat: Add specific browser headers for StealthMode

The stealth fetcher now takes an optional azuretls browser name and sends
a matching ordered header preset. Omitting the browser, or passing an
empty one, keeps the previous Firefox profile, and WithStealth stays
callable without arguments.
---
Review amendments: default an empty browser to Firefox, make WithStealth
variadic, add doc comments. Hunk ranges and the diffstat were recomputed by
hand; the blob ids on the index lines still refer to the unamended content.

 adapters/fetchers/stealth/browsers.go | 129 ++++++++++++++++++++++++++
 adapters/fetchers/stealth/stealth.go  |  17 +++-
 scrapemateapp/config.go               |  11 ++-
 scrapemateapp/scrapemateapp.go        |   2 +-
 4 files changed, 154 insertions(+), 5 deletions(-)
 create mode 100644 adapters/fetchers/stealth/browsers.go

diff --git a/adapters/fetchers/stealth/browsers.go b/adapters/fetchers/stealth/browsers.go
new file mode 100644
index 0000000..07c3cf8
--- /dev/null
+++ b/adapters/fetchers/stealth/browsers.go
@@ -0,0 +1,129 @@
+package stealth
+
+import "github.com/Noooste/azuretls-client"
+
+// settings pairs an azuretls browser profile name with the ordered request
+// headers the stealth fetcher sends for that profile.
+type settings struct {
+	browser string
+	headers azuretls.OrderedHeaders
+}
+
+// newSettings builds the settings for the given azuretls browser name.
+//
+// An empty name falls back to azuretls.Firefox, the profile the stealth
+// fetcher was hard-coded to before the browser became configurable. Names
+// without a header preset below are passed through with no ordered headers.
+func newSettings(browser string) settings {
+	if browser == "" {
+		browser = azuretls.Firefox
+	}
+
+	ans := settings{
+		browser: browser,
+	}
+
+	switch browser {
+	case azuretls.Chrome:
+		ans.headers = chromeHeaders()
+	case azuretls.Firefox:
+		ans.headers = firefoxHeaders()
+	case azuretls.Opera:
+		ans.headers = operaHeaders()
+	case azuretls.Safari:
+		ans.headers = safariHeaders()
+	case azuretls.Edge:
+		ans.headers = edgeHeaders()
+	}
+
+	return ans
+}
+
+// edgeHeaders returns the ordered header preset for Edge (Edge 119 on Windows per its User-Agent).
+func edgeHeaders() azuretls.OrderedHeaders {
+	return azuretls.OrderedHeaders{
+		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"},
+		{"Accept-Encoding", "gzip, deflate, br"},
+		{"Accept-Language", "en-US,en;q=0.9"},
+		{"Cache-Control", "max-age=0"},
+		{"Sec-Ch-Ua", `"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"`},
+		{"Sec-Ch-Ua-Mobile", "?0"},
+		{"Sec-Ch-Ua-Platform", `"Windows"`},
+		{"Sec-Fetch-Dest", "document"},
+		{"Sec-Fetch-Mode", "navigate"},
+		{"Sec-Fetch-Site", "none"},
+		{"Sec-Fetch-User", "?1"},
+		{"Upgrade-Insecure-Requests", "1"},
+		{"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"},
+	}
+}
+
+// chromeHeaders returns the ordered header preset for Chrome (Chrome 120 on Windows per its User-Agent).
+func chromeHeaders() azuretls.OrderedHeaders {
+	return azuretls.OrderedHeaders{
+		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"},
+		{"Accept-Encoding", "gzip, deflate, br"},
+		{"Accept-Language", "en-US,en;q=0.9"},
+		{"Cache-Control", "max-age=0"},
+		{"Sec-Ch-Ua", `"Google Chrome";v="120", "Chromium";v="120", "Not?A_Brand";v="24"`},
+		{"Sec-Ch-Ua-Mobile", "?0"},
+		{"Sec-Ch-Ua-Platform", `"Windows"`},
+		{"Sec-Fetch-Dest", "document"},
+		{"Sec-Fetch-Mode", "navigate"},
+		{"Sec-Fetch-Site", "none"},
+		{"Sec-Fetch-User", "?1"},
+		{"Upgrade-Insecure-Requests", "1"},
+		{"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
+	}
+}
+
+// firefoxHeaders returns the ordered header preset for Firefox (Firefox 121 on Windows per its User-Agent).
+func firefoxHeaders() azuretls.OrderedHeaders {
+	return azuretls.OrderedHeaders{
+		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"},
+		{"Accept-Encoding", "gzip, deflate, br"},
+		{"Accept-Language", "en-US,en;q=0.5"},
+		{"Cache-Control", "max-age=0"},
+		{"DNT", "1"},
+		{"Sec-Fetch-Dest", "document"},
+		{"Sec-Fetch-Mode", "navigate"},
+		{"Sec-Fetch-Site", "none"},
+		{"Sec-Fetch-User", "?1"},
+		{"Upgrade-Insecure-Requests", "1"},
+		{"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"},
+	}
+}
+
+// operaHeaders returns the ordered header preset for Opera (Opera 103 on Windows per its User-Agent).
+func operaHeaders() azuretls.OrderedHeaders {
+	return azuretls.OrderedHeaders{
+		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"},
+		{"Accept-Encoding", "gzip, deflate, br"},
+		{"Accept-Language", "en-US,en;q=0.9"},
+		{"Cache-Control", "max-age=0"},
+		{"Sec-Ch-Ua", `"Opera";v="103", "Chromium";v="117", "Not;A=Brand";v="8"`},
+		{"Sec-Ch-Ua-Mobile", "?0"},
+		{"Sec-Ch-Ua-Platform", `"Windows"`},
+		{"Sec-Fetch-Dest", "document"},
+		{"Sec-Fetch-Mode", "navigate"},
+		{"Sec-Fetch-Site", "none"},
+		{"Sec-Fetch-User", "?1"},
+		{"Upgrade-Insecure-Requests", "1"},
+		{"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 OPR/103.0.0.0"},
+	}
+}
+
+// safariHeaders returns the ordered header preset for Safari (Safari 17.2 on macOS per its User-Agent).
+func safariHeaders() azuretls.OrderedHeaders {
+	return azuretls.OrderedHeaders{
+		{"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},
+		{"Accept-Encoding", "gzip, deflate, br"},
+		{"Accept-Language", "en-US,en;q=0.9"},
+		{"Cache-Control", "max-age=0"},
+		{"Sec-Fetch-Dest", "document"},
+		{"Sec-Fetch-Mode", "navigate"},
+		{"Sec-Fetch-Site", "none"},
+		{"Upgrade-Insecure-Requests", "1"},
+		{"User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"},
+	}
+}
diff --git a/adapters/fetchers/stealth/stealth.go b/adapters/fetchers/stealth/stealth.go
index 404ba2d..27a8234 100644
--- a/adapters/fetchers/stealth/stealth.go
+++ b/adapters/fetchers/stealth/stealth.go
@@ -12,10 +12,20 @@ import (
 )
 
 type stealthFetch struct {
+	browserSettings settings
 }
 
-func New() scrapemate.HTTPFetcher {
-	return &stealthFetch{}
+// New returns an HTTPFetcher that configures every azuretls session with the
+// given browser name and its ordered header preset. Only the first browser
+// argument is used; when it is omitted or empty the fetcher keeps its
+// previous Firefox profile.
+func New(browser ...string) scrapemate.HTTPFetcher {
+	var name string
+	if len(browser) > 0 {
+		name = browser[0]
+	}
+
+	return &stealthFetch{browserSettings: newSettings(name)}
 }
 
 func (o *stealthFetch) Close() error {
@@ -36,7 +46,8 @@ func (o *stealthFetch) Fetch(ctx context.Context, job scrapemate.IJob) scrapemat
 
 	defer session.Close()
 
-	session.Browser = azuretls.Firefox
+	session.Browser = o.browserSettings.browser
+	session.OrderedHeaders = o.browserSettings.headers
 
 	req := azuretls.Request{
 		Method: job.GetMethod(),
diff --git a/scrapemateapp/config.go b/scrapemateapp/config.go
index f8d50c0..91a4e37 100644
--- a/scrapemateapp/config.go
+++ b/scrapemateapp/config.go
@@ -60,9 +60,15 @@ func WithJS(opts ...func(*jsOptions)) func(*Config) error {
 	}
 }
 
-func WithStealth() func(*Config) error {
+// WithStealth enables stealth mode. The optional browser selects the browser
+// profile handed to the stealth fetcher; only the first value is used, and
+// omitting it leaves StealthBrowser empty (the fetcher then uses Firefox).
+func WithStealth(browser ...string) func(*Config) error {
 	return func(o *Config) error {
 		o.UseStealth = true
+		if len(browser) > 0 {
+			o.StealthBrowser = browser[0]
+		}
 
 		return o.validate()
 	}
@@ -147,6 +153,9 @@ type Config struct {
 	// UseStealth is whether to use stealth mode to scrape the page.
 	// uses a special http client to scrape the page.
 	UseStealth bool `validate:"omitempty"`
+	// StealthBrowser is the browser profile name passed to the stealth
+	// fetcher; an empty value selects the fetcher's default.
+	StealthBrowser string `validate:"omitempty"`
 	// JSOpts are the options for the JavaScript renderer.
 	JSOpts jsOptions
 
diff --git a/scrapemateapp/scrapemateapp.go b/scrapemateapp/scrapemateapp.go
index f38c229..a8ef095 100644
--- a/scrapemateapp/scrapemateapp.go
+++ b/scrapemateapp/scrapemateapp.go
@@ -168,7 +168,7 @@ func (app *ScrapemateApp) getFetcher() (scrapemate.HTTPFetcher, error) {
 		}
 	default:
 		if app.cfg.UseStealth {
-			httpFetcher = stealth.New()
+			httpFetcher = stealth.New(app.cfg.StealthBrowser)
 		} else {
 			cookieJar, err := cookiejar.New(nil)
 			if err != nil {