Skip to content

Commit

Permalink
Fix HttpLoader::dontUseCookies() with browser
Browse files Browse the repository at this point in the history
`HttpLoader::dontUseCookies()` now also works when using the Chrome
browser. Cookies are cleared before every request.
  • Loading branch information
otsch committed Jan 10, 2025
1 parent 42e0e70 commit e5701a6
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [3.1.4] - 2025-01-10
### Fixed
* `HttpLoader::dontUseCookies()` now also works when using the Chrome browser. Cookies are cleared before every request.

## [3.1.3] - 2025-01-10
### Fixed
* Further improve getting the raw response body from non-HTML documents via Chrome browser.
Expand Down
4 changes: 4 additions & 0 deletions src/Loader/Http/HeadlessBrowserLoaderHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ public function navigateToPageAndGetRespondedRequest(
): RespondedRequest {
$this->page = $this->getBrowser($request, $proxy)->createPage();

if ($cookieJar === null) {
$this->page->getSession()->sendMessageSync(new Message('Network.clearBrowserCookies'));
}

$statusCode = 200;

$responseHeaders = [];
Expand Down
2 changes: 1 addition & 1 deletion src/Loader/Http/HttpLoader.php
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ protected function loadViaClientOrHeadlessBrowser(RequestInterface $request): Re
$request,
$this->throttler,
$proxy,
$this->cookieJar,
$this->useCookies ? $this->cookieJar : null,
);
}

Expand Down
40 changes: 40 additions & 0 deletions tests/Loader/Http/HeadlessBrowserLoaderHelperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@
namespace tests\Loader\Http;

use Closure;
use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Crawler\Loader\Http\HeadlessBrowserLoaderHelper;
use Crwlr\Crawler\Steps\Loading\Http;
use Exception;
use GuzzleHttp\Psr7\Request;
use HeadlessChromium\AutoDiscover;
use HeadlessChromium\Browser\ProcessAwareBrowser;
use HeadlessChromium\BrowserFactory;
use HeadlessChromium\Communication\Message;
use HeadlessChromium\Communication\Session;
use HeadlessChromium\Cookies\CookiesCollection;
use HeadlessChromium\Page;
use HeadlessChromium\PageUtils\PageNavigation;
use Mockery;
Expand All @@ -21,6 +24,7 @@ function helper_setUpHeadlessChromeMocks(
?Closure $pageNavigationArgsClosure = null,
?Closure $createBrowserArgsExpectationCallback = null,
?Closure $browserMockCallback = null,
?Closure $pageSessionMockCallback = null,
): BrowserFactory {
$browserFactoryMock = Mockery::mock(BrowserFactory::class);

Expand All @@ -46,6 +50,12 @@ function helper_setUpHeadlessChromeMocks(

$pageMock->shouldReceive('getSession')->andReturn($sessionMock);

if ($pageSessionMockCallback) {
$pageSessionMockCallback($sessionMock);
}

$pageMock->shouldReceive('getCookies')->andReturn(new CookiesCollection([]));

$sessionMock->shouldReceive('once');

$pageNavigationMock = Mockery::mock(PageNavigation::class);
Expand Down Expand Up @@ -75,6 +85,7 @@ function helper_setUpHeadlessChromeMocks(
$response = $helper->navigateToPageAndGetRespondedRequest(
new Request('GET', 'https://www.example.com/foo'),
helper_getMinThrottler(),
cookieJar: new CookieJar(),
);

expect(Http::getBodyString($response))->toBe('<html><head></head><body>Hello World!</body></html>');
Expand Down Expand Up @@ -104,6 +115,7 @@ function helper_setUpHeadlessChromeMocks(
$response = $helper->navigateToPageAndGetRespondedRequest(
new Request('GET', 'https://www.example.com/foo'),
helper_getMinThrottler(),
cookieJar: new CookieJar(),
);

expect(Http::getBodyString($response))->toBe('<html><head></head><body>Hello World!</body></html>');
Expand Down Expand Up @@ -168,6 +180,7 @@ function (Page $page) use (& $hook3Called) {
$helper->navigateToPageAndGetRespondedRequest(
new Request('GET', 'https://www.example.com/foo'),
helper_getMinThrottler(),
cookieJar: new CookieJar(),
);

expect($hook1Called)->toBeTrue()
Expand All @@ -179,6 +192,7 @@ function (Page $page) use (& $hook3Called) {
$helper->navigateToPageAndGetRespondedRequest(
new Request('GET', 'https://www.example.com/foo'),
helper_getMinThrottler(),
cookieJar: new CookieJar(),
);

expect($hook1Called)->toBeFalse()
Expand Down Expand Up @@ -208,6 +222,7 @@ function () {
$helper->navigateToPageAndGetRespondedRequest(
new Request('GET', 'https://www.example.com/bar'),
helper_getMinThrottler(),
cookieJar: new CookieJar(),
);
},
);
Expand All @@ -224,6 +239,7 @@ function () {
$helper->navigateToPageAndGetRespondedRequest(
new Request('GET', 'https://www.example.com/bar'),
helper_getMinThrottler(),
cookieJar: new CookieJar(),
);
});

Expand All @@ -241,6 +257,7 @@ function () {
$response = $helper->navigateToPageAndGetRespondedRequest(
new Request('GET', 'https://www.example.com/bar', ['user-agent' => ['MyBot']]),
helper_getMinThrottler(),
cookieJar: new CookieJar(),
);

expect(Http::getBodyString($response))->toBe('<html><head></head><body>Hello World!</body></html>');
Expand All @@ -263,8 +280,31 @@ function () {
$response = $helper->navigateToPageAndGetRespondedRequest(
new Request('GET', 'https://www.example.com/bar', ['user-agent' => ['MyBot']]),
helper_getMinThrottler(),
cookieJar: new CookieJar(),
);

expect(Http::getBodyString($response))->toBe('<html><head></head><body>Hello World!</body></html>');
},
);

// Calls the helper without a cookie jar and expects the page session to receive exactly one
// `Network.clearBrowserCookies` message, while the response body is still returned as usual.
it('clears the browsers cookies when no cookie jar is provided', function () {
    // Registered on the page's session mock by helper_setUpHeadlessChromeMocks().
    $expectClearCookiesMessage = function (Mockery\MockInterface $sessionMock) {
        $sessionMock
            ->shouldReceive('sendMessageSync')
            ->once()
            ->withArgs(fn (Message $message) => $message->getMethod() === 'Network.clearBrowserCookies');
    };

    $helper = new HeadlessBrowserLoaderHelper(
        helper_setUpHeadlessChromeMocks(pageSessionMockCallback: $expectClearCookiesMessage),
    );

    $request = new Request('GET', 'https://www.example.com/yolo', ['user-agent' => ['MyBot']]);

    // Deliberately no `cookieJar:` argument here – that is the condition under test.
    $response = $helper->navigateToPageAndGetRespondedRequest($request, helper_getMinThrottler());

    expect(Http::getBodyString($response))->toBe('<html><head></head><body>Hello World!</body></html>');
});
23 changes: 23 additions & 0 deletions tests/_Integration/Http/HeadlessBrowserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,29 @@ protected function invoke(mixed $input): Generator
->and($results[0]->get('printed-cookie'))->toBe('foo123');
});

// Integration test: with cookies disabled on the loader, a request to /set-cookie followed by a
// request to /print-cookie must yield an empty printed cookie value.
it('does not use cookies when HttpLoader::dontUseCookies() was called', function () {
    $crawler = new HeadlessBrowserCrawler();
    $crawler->getLoader()->dontUseCookies();

    // Intermediate step that ignores its input and emits the URL of the page printing the cookie.
    $yieldPrintCookieUrl = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'http://localhost:8000/print-cookie';
        }
    };

    $crawler->input('http://localhost:8000/set-cookie');
    $crawler->addStep(Http::get());
    $crawler->addStep($yieldPrintCookieUrl);
    $crawler->addStep(Http::get());
    $crawler->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie'));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    expect($results[0]->get('printed-cookie'))->toBeEmpty();
});

it('renders javascript', function () {
$crawler = new HeadlessBrowserCrawler();

Expand Down

0 comments on commit e5701a6

Please sign in to comment.