Skip to content

Commit

Permalink
Two new HeadlessBrowserLoaderHelper features
Browse files Browse the repository at this point in the history
* New method `HeadlessBrowserLoaderHelper::setPageInitScript()`
  (`$crawler->getLoader()->browser()->setPageInitScript()`) to provide
  JavaScript code that is executed on every new browser page before
  navigating anywhere.
* New method `HeadlessBrowserLoaderHelper::useNativeUserAgent()`
  (`$crawler->getLoader()->browser()->useNativeUserAgent()`) to allow
  using the native `User-Agent` that your Chrome browser sends by
  default.
  • Loading branch information
otsch committed Jan 3, 2025
1 parent be8905b commit 35365b5
Show file tree
Hide file tree
Showing 6 changed files with 187 additions and 2 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### [3.1.0] - 2025-01-03
### Added
* New method `HeadlessBrowserLoaderHelper::setPageInitScript()` (`$crawler->getLoader()->browser()->setPageInitScript()`) to provide JavaScript code that is executed on every new browser page before navigating anywhere.
* New method `HeadlessBrowserLoaderHelper::useNativeUserAgent()` (`$crawler->getLoader()->browser()->useNativeUserAgent()`) to allow using the native `User-Agent` that your Chrome browser sends by default.

### [3.0.4] - 2024-12-18
### Fixed
* Minor improvement for the `DomQuery` (base for `Dom::cssSelector()` and `Dom::xPath()`): enable providing an empty string as selector, to simply get the node that the selector is applied to.
Expand Down
30 changes: 29 additions & 1 deletion src/Loader/Http/HeadlessBrowserLoaderHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ class HeadlessBrowserLoaderHelper

protected int $timeout = 30_000;

protected ?string $pageInitScript = null;

protected bool $useNativeUserAgent = false;

/**
* @var Closure[]
*/
Expand Down Expand Up @@ -207,6 +211,24 @@ public function sanitizeResponseHeaders(array $headers): array
return $headers;
}

/**
* Stores JavaScript source code that is handed to the browser's setPagePreScript()
* method when the browser instance is created.
*
* NOTE(review): the script is only applied in the code path that creates the browser, and
* this setter does not touch the optionsDirty flag. Calling it after a browser was already
* launched presumably has no effect until the browser is re-created - confirm.
* NOTE(review): the code applying the script uses a truthiness check, so '' and '0' are
* treated like "no script".
*
* @param string $scriptSource JavaScript source code
* @return $this
*/
public function setPageInitScript(string $scriptSource): static
{
$this->pageInitScript = $scriptSource;

return $this;
}

/**
* Switches the helper to the browser's own (native) user agent.
*
* Once enabled, the `User-Agent` header of a request is no longer passed on as the
* `userAgent` browser option; the header is dropped when the browser options are
* built from the request.
*
* @return $this
*/
public function useNativeUserAgent(): static
{
$this->useNativeUserAgent = true;

return $this;
}

/**
* @throws OperationTimedOut
* @throws CommunicationException
Expand Down Expand Up @@ -273,6 +295,10 @@ protected function getBrowser(

$this->browser = $this->browserFactory->createBrowser($options);

if ($this->pageInitScript) {
$this->browser->setPagePreScript($this->pageInitScript);
}

$this->optionsDirty = false;
}

Expand All @@ -292,8 +318,10 @@ protected function optionsFromRequest(RequestInterface $request, ?string $proxy
{
$options = $this->options;

if (isset($request->getHeader('User-Agent')[0])) {
if (isset($request->getHeader('User-Agent')[0]) && !$this->useNativeUserAgent) {
$options['userAgent'] = $request->getHeader('User-Agent')[0];
} elseif ($this->useNativeUserAgent && !empty($request->getHeader('User-Agent'))) {
$request = $request->withoutHeader('User-Agent');
}

$options['headers'] = array_merge(
Expand Down
97 changes: 96 additions & 1 deletion tests/Loader/Http/HeadlessBrowserLoaderHelperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,29 @@

function helper_setUpHeadlessChromeMocks(
?Closure $pageNavigationArgsClosure = null,
?Closure $createBrowserArgsExpectationCallback = null,
?Closure $browserMockCallback = null,
): BrowserFactory {
$browserFactoryMock = Mockery::mock(BrowserFactory::class);

$browserMock = Mockery::mock(ProcessAwareBrowser::class);

$browserFactoryMock->shouldReceive('createBrowser')->andReturn($browserMock);
$createBrowserExpectation = $browserFactoryMock->shouldReceive('createBrowser');

if ($createBrowserArgsExpectationCallback) {
$createBrowserExpectation->withArgs($createBrowserArgsExpectationCallback);
}

$createBrowserExpectation->andReturn($browserMock);

$pageMock = Mockery::mock(Page::class);

$browserMock->shouldReceive('createPage')->andReturn($pageMock);

if ($browserMockCallback) {
$browserMockCallback($browserMock);
}

$sessionMock = Mockery::mock(Session::class);

$pageMock->shouldReceive('getSession')->andReturn($sessionMock);
Expand Down Expand Up @@ -173,3 +185,86 @@ function (Page $page) use (& $hook3Called) {
->and($hook2Called)->toBeFalse()
->and($hook3Called)->toBeFalse();
});

it(
    'passes the script source provided via the setPageInitScript() method, to the ' .
    'ProcessAwareBrowser::setPagePreScript() method',
    function () {
        $initScript = 'console.log(\'hey\');';

        // The browser mock has to receive exactly this script source, exactly once.
        $expectPreScript = function (Mockery\MockInterface $browserMock) use ($initScript) {
            $browserMock->shouldReceive('setPagePreScript')->once()->with($initScript);
        };

        $loaderHelper = new HeadlessBrowserLoaderHelper(
            helper_setUpHeadlessChromeMocks(browserMockCallback: $expectPreScript),
        );

        // setPageInitScript() is fluent, so navigation can be chained right onto it.
        $loaderHelper
            ->setPageInitScript($initScript)
            ->navigateToPageAndGetRespondedRequest(
                new Request('GET', 'https://www.example.com/bar'),
                helper_getMinThrottler(),
            );
    },
);

it('does not call the ProcessAwareBrowser::setPagePreScript() when no page init script was defined', function () {
    // setPageInitScript() is never called in this test, so the browser mock must not see setPagePreScript().
    $forbidPreScript = function (Mockery\MockInterface $browserMock) {
        $browserMock->shouldNotReceive('setPagePreScript');
    };

    $loaderHelper = new HeadlessBrowserLoaderHelper(
        helper_setUpHeadlessChromeMocks(browserMockCallback: $forbidPreScript),
    );

    $request = new Request('GET', 'https://www.example.com/bar');

    $loaderHelper->navigateToPageAndGetRespondedRequest($request, helper_getMinThrottler());
});

it(
    'passes the userAgent option when Request contains a user-agent header and useNativeUserAgent() was not called',
    function () {
        // createBrowser() only matches when the request's user agent arrives as the `userAgent` option.
        $hasBotUserAgent = fn ($options) => array_key_exists('userAgent', $options)
            && $options['userAgent'] === 'MyBot';

        $loaderHelper = new HeadlessBrowserLoaderHelper(
            helper_setUpHeadlessChromeMocks(createBrowserArgsExpectationCallback: $hasBotUserAgent),
        );

        $request = new Request('GET', 'https://www.example.com/bar', ['user-agent' => ['MyBot']]);

        $respondedRequest = $loaderHelper->navigateToPageAndGetRespondedRequest($request, helper_getMinThrottler());

        expect(Http::getBodyString($respondedRequest))
            ->toBe('<html><head></head><body>Hello World!</body></html>');
    },
);

it(
    'does not pass the userAgent option when Request contains a user-agent header and useNativeUserAgent() was called',
    function () {
        // With the native user agent enabled, createBrowser() must be called without a `userAgent` option.
        $lacksUserAgentOption = fn ($options) => !array_key_exists('userAgent', $options);

        $loaderHelper = new HeadlessBrowserLoaderHelper(
            helper_setUpHeadlessChromeMocks(createBrowserArgsExpectationCallback: $lacksUserAgentOption),
        );

        // useNativeUserAgent() is fluent, so navigation can be chained right onto it.
        $respondedRequest = $loaderHelper
            ->useNativeUserAgent()
            ->navigateToPageAndGetRespondedRequest(
                new Request('GET', 'https://www.example.com/bar', ['user-agent' => ['MyBot']]),
                helper_getMinThrottler(),
            );

        expect(Http::getBodyString($respondedRequest))
            ->toBe('<html><head></head><body>Hello World!</body></html>');
    },
);
42 changes: 42 additions & 0 deletions tests/_Integration/Http/HeadlessBrowserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,30 @@ function helper_getCookiesByDomainFromLoader(HttpLoader $loader, string $domain)
->and($results[0]->get('responseBody')['User-Agent'])->toBe('HeadlessBrowserBot');
});

it(
    'does not use the user-agent defined in the crawler, when useNativeUserAgent() was called on the browser loader ' .
    'helper',
    function () {
        $crawler = new HeadlessBrowserCrawler();

        $crawler->getLoader()->browser()->useNativeUserAgent();

        $crawler
            ->input('http://localhost:8000/print-headers')
            ->addStep(Http::get())
            ->addStep((new GetJsonFromResponseHtmlBody())->keepAs('responseBody'));

        $results = helper_generatorToArray($crawler->run());

        expect($results)->toHaveCount(1);

        // The endpoint's response body, kept as `responseBody`, is expected to contain the received headers.
        $headers = $results[0]->get('responseBody');

        expect($headers)
            ->toBeArray()
            ->toHaveKey('User-Agent')
            ->and($headers['User-Agent'])->toStartWith('Mozilla/5.0 (');
    },
);

it('uses cookies', function () {
$crawler = new HeadlessBrowserCrawler();

Expand Down Expand Up @@ -263,3 +287,21 @@ protected function invoke(mixed $input): Generator

expect($body)->toContain('<div id="delayed_container">hooray</div>');
});

it('executes the javascript code provided via HeadlessBrowserLoaderHelper::setPageInitScript()', function () {
    $crawler = new HeadlessBrowserCrawler();

    // Define a global before any navigation; the fixture page's inline script copies it into #content.
    $crawler
        ->getLoader()
        ->browser()
        ->setPageInitScript('window._secret_content = \'secret content\'');

    $crawler
        ->input('http://localhost:8000/page-init-script')
        ->addStep(Http::get())
        ->addStep(Html::root()->extract(['content' => '#content']));

    $results = helper_generatorToArray($crawler->run());

    // Assert the result count first, so a crawl that yields nothing fails with a clear expectation
    // message instead of an undefined-offset error on $results[0].
    expect($results)->toHaveCount(1)
        ->and($results[0]->get('content'))->toBe('secret content');
});
4 changes: 4 additions & 0 deletions tests/_Integration/Server.php
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,7 @@ function getParamAfter(string $route, string $after): string
if (str_starts_with($route, '/non-utf-8-charset')) {
return include(__DIR__ . '/_Server/NonUtf8.php');
}

// Fixture page requested by the headless browser integration test for the page init script.
if (str_starts_with($route, '/page-init-script')) {
return include(__DIR__ . '/_Server/PageInitScript.php');
}
11 changes: 11 additions & 0 deletions tests/_Integration/_Server/PageInitScript.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<!Doctype html>
<html>
<head>
</head>
<body>
<div id="content"></div>
<script>
// window._secret_content is not defined anywhere in this page. It is expected to be set
// beforehand by a script injected through the browser (the page init script under test),
// otherwise #content ends up containing the string "undefined".
document.getElementById('content').innerHTML = window._secret_content;
</script>
</body>
</html>

0 comments on commit 35365b5

Please sign in to comment.