Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Screenshot Browser Action and Http Step Improvements #196

Merged
merged 3 commits into from
Mar 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [3.4.0] - 2025-03-06
### Added
* Two new methods to the base class of all `Http` steps:
  * `skipCache()` – Allows using the cache while skipping it for a specific loading step.
  * `useBrowser()` – Switches the loader to use a (headless) Chrome browser for loading calls in a specific step and then reverts the loader to its previous setting.
* Introduced the new `BrowserAction::screenshot()` post browser navigate hook. It accepts an instance of the new `ScreenshotConfig` class, allowing you to configure various options (see the methods of `ScreenshotConfig`). If successful, the screenshot file paths are included in the `RespondedRequest` output object of the `Http` step.

## [3.3.0] - 2025-03-02
### Added
* New `BrowserAction`s to use with the `postBrowserNavigateHook()` method:
Expand Down
10 changes: 10 additions & 0 deletions src/Loader/Http/Browser/Screenshot.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?php

namespace Crwlr\Crawler\Loader\Http\Browser;

/**
 * Immutable value object that refers to a screenshot image by its file path.
 */
class Screenshot
{
    /**
     * @var string path of the screenshot file
     */
    public readonly string $path;

    public function __construct(string $path)
    {
        $this->path = $path;
    }
}
86 changes: 86 additions & 0 deletions src/Loader/Http/Browser/ScreenshotConfig.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
<?php

namespace Crwlr\Crawler\Loader\Http\Browser;

use Crwlr\Utils\Microseconds;
use HeadlessChromium\Clip;
use HeadlessChromium\Exception\CommunicationException\CannotReadResponse;
use HeadlessChromium\Exception\CommunicationException\InvalidResponse;
use HeadlessChromium\Page;

/**
 * Options for taking a screenshot of a headless browser page: where the file
 * is stored, the image format, the image quality and whether the full page
 * (instead of only the viewport) is captured.
 */
class ScreenshotConfig
{
    public function __construct(
        public string $storePath,
        public string $fileType = 'png',
        public ?int $quality = null,
        public bool $fullPage = false,
    ) {}

    /**
     * Named constructor: a config with the given store path and all other options at their defaults.
     */
    public static function make(string $storePath): self
    {
        $config = new self($storePath);

        return $config;
    }

    /**
     * Builds the target file path for a screenshot of the given page.
     *
     * The file name is the md5 hash of the page's current URL, a dash, the current
     * microseconds value, a dot and the configured file type. A slash is inserted
     * between store path and file name unless the store path already ends with one.
     *
     * @throws CannotReadResponse
     * @throws InvalidResponse
     */
    public function getFullPath(Page $page): string
    {
        $urlHash = md5($page->getCurrentUrl());

        $timestamp = Microseconds::now()->value;

        $separator = str_ends_with($this->storePath, '/') ? '' : '/';

        return $this->storePath . $separator . $urlHash . '-' . $timestamp . '.' . $this->fileType;
    }

    /**
     * Sets the image file type. Only 'jpeg', 'png' and 'webp' are accepted; any other
     * value leaves the config untouched.
     *
     * Switching to 'png' clears the quality setting. Switching to 'jpeg' or 'webp'
     * sets the quality to 80 when none is set yet.
     */
    public function setImageFileType(string $type): self
    {
        if (!in_array($type, ['jpeg', 'png', 'webp'], true)) {
            return $this;
        }

        $this->fileType = $type;

        if ($type === 'png') {
            $this->quality = null;
        } elseif ($this->quality === null) {
            $this->quality = 80;
        }

        return $this;
    }

    /**
     * Sets the image quality. The value is applied only when the current file type is
     * 'jpeg' or 'webp' and the value is between 1 and 100 (inclusive); otherwise the
     * call changes nothing.
     */
    public function setQuality(int $quality): self
    {
        $acceptsQuality = $this->fileType === 'jpeg' || $this->fileType === 'webp';

        if (!$acceptsQuality || $quality < 1 || $quality > 100) {
            return $this;
        }

        $this->quality = $quality;

        return $this;
    }

    /**
     * Enables capturing the full page.
     */
    public function setFullPage(): self
    {
        $this->fullPage = true;

        return $this;
    }

    /**
     * Translates this config into the options array for the page's screenshot call.
     *
     * @return array<string, int|string|bool|Clip>
     */
    public function toChromePhpScreenshotConfig(Page $page): array
    {
        $hasQuality = $this->quality !== null && $this->quality !== 0;

        $qualityOption = $hasQuality && in_array($this->fileType, ['jpeg', 'webp'], true)
            ? ['quality' => $this->quality]
            : [];

        $fullPageOptions = $this->fullPage
            ? ['captureBeyondViewport' => true, 'clip' => $page->getFullPageClip()]
            : [];

        // The keys are distinct, so the union only appends and keeps the key order:
        // format, quality, captureBeyondViewport, clip.
        return ['format' => $this->fileType] + $qualityOption + $fullPageOptions;
    }
}
34 changes: 29 additions & 5 deletions src/Loader/Http/HeadlessBrowserLoaderHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace Crwlr\Crawler\Loader\Http;

use Closure;
use Crwlr\Crawler\Loader\Http\Browser\Screenshot;
use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Crawler\Loader\Http\Cookies\Exceptions\InvalidCookieException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
Expand All @@ -24,6 +25,7 @@
use HeadlessChromium\Page;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;
use Throwable;

class HeadlessBrowserLoaderHelper
Expand Down Expand Up @@ -60,7 +62,10 @@ class HeadlessBrowserLoaderHelper
*/
protected array $tempPostNavigateHooks = [];

public function __construct(private ?BrowserFactory $browserFactory = null) {}
/**
 * Both dependencies are optional and default to null.
 *
 * @param BrowserFactory|null $browserFactory NOTE(review): how the factory is used is not visible in this excerpt
 * @param LoggerInterface|null $logger handed to every post navigate hook as its second argument (after the page)
 */
public function __construct(
private ?BrowserFactory $browserFactory = null,
protected ?LoggerInterface $logger = null,
) {}

/**
* Set temporary post navigate hooks
Expand Down Expand Up @@ -131,7 +136,7 @@ function ($params) use (&$statusCode, &$responseHeaders, &$requestId) {

$throttler->trackRequestEndFor($request->getUri());

$this->callPostNavigateHooks();
$hookActionData = $this->callPostNavigateHooks();

if (is_string($requestId) && $this->page && !$this->responseIsHtmlDocument($this->page)) {
$html = $this->tryToGetRawResponseBody($this->page, $requestId) ?? $this->getHtmlFromPage();
Expand All @@ -141,7 +146,11 @@ function ($params) use (&$statusCode, &$responseHeaders, &$requestId) {

$this->addCookiesToJar($cookieJar, $request->getUri());

return new RespondedRequest($request, new Response($statusCode, $responseHeaders, $html));
return new RespondedRequest(
$request,
new Response($statusCode, $responseHeaders, $html),
$hookActionData['screenshots'] ?? [],
);
}

public function getOpenBrowser(): ?Browser
Expand Down Expand Up @@ -280,15 +289,30 @@ protected function navigate(string $url): void
}
}

protected function callPostNavigateHooks(): void
/**
* @return array<string, mixed>
*/
protected function callPostNavigateHooks(): array
{
$returnData = [];

if (!empty($this->tempPostNavigateHooks)) {
foreach ($this->tempPostNavigateHooks as $hook) {
$hook->call($this, $this->page);
$returnValue = $hook->call($this, $this->page, $this->logger);

if ($returnValue instanceof Screenshot) {
if (!array_key_exists('screenshots', $returnData)) {
$returnData['screenshots'] = [$returnValue];
} else {
$returnData['screenshots'][] = $returnValue;
}
}
}
}

$this->tempPostNavigateHooks = [];

return $returnData;
}

/**
Expand Down
17 changes: 16 additions & 1 deletion src/Loader/Http/HttpLoader.php
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ class HttpLoader extends Loader
*/
protected array $cacheUrlFilters = [];

protected bool $skipCacheForNextRequest = false;

protected ?ProxyManager $proxies = null;

/**
Expand Down Expand Up @@ -303,7 +305,7 @@ public function useRotatingProxies(array $proxyUrls): void
public function browser(): HeadlessBrowserLoaderHelper
{
if (!$this->browserHelper) {
$this->browserHelper = new HeadlessBrowserLoaderHelper();
$this->browserHelper = new HeadlessBrowserLoaderHelper(logger: $this->logger);
}

return $this->browserHelper;
Expand All @@ -319,6 +321,13 @@ public function addToCache(RespondedRequest $respondedRequest): void
}
}

/**
 * Sets the flag that makes shouldRequestBeServedFromCache() return false.
 *
 * The flag is reset to false in tryLoading(), before the request is actually loaded.
 *
 * @return static the loader itself, for method chaining
 */
public function skipCacheForNextRequest(): static
{
$this->skipCacheForNextRequest = true;

return $this;
}

/**
* @throws LoadingException|Throwable|\Psr\SimpleCache\InvalidArgumentException
*/
Expand All @@ -340,6 +349,8 @@ protected function tryLoading(
$this->callHook('onCacheHit', $request, $respondedRequest->response);
}

$this->skipCacheForNextRequest = false;

if (!$respondedRequest) {
$respondedRequest = $this->waitForGoAndLoad($request);
}
Expand Down Expand Up @@ -582,6 +593,10 @@ protected function shouldResponseBeCached(RespondedRequest $respondedRequest): b

protected function shouldRequestBeServedFromCache(RequestInterface $request): bool
{
if ($this->skipCacheForNextRequest === true) {
return false;
}

if (!empty($this->cacheUrlFilters)) {
foreach ($this->cacheUrlFilters as $filter) {
if (!$filter->evaluate((string) $request->getUri())) {
Expand Down
26 changes: 26 additions & 0 deletions src/Loader/Http/Messages/RespondedRequest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace Crwlr\Crawler\Loader\Http\Messages;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Browser\Screenshot;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Utils\RequestKey;
use Crwlr\Url\Url;
Expand All @@ -22,11 +23,13 @@ class RespondedRequest
protected bool $isServedFromCache = false;

/**
 * Pairs a request with its response and passes the response through setResponse().
 *
 * @param Screenshot[] $screenshots screenshots associated with this response (for example those returned by
 *                                  post navigate hooks of the headless browser helper); empty by default
 * @throws Exception
 */
public function __construct(
public RequestInterface $request,
public ResponseInterface $response,
public array $screenshots = [],
) {
$this->setResponse($this->response);
}
Expand All @@ -41,6 +44,7 @@ public static function fromArray(array $data): RespondedRequest
$respondedRequest = new RespondedRequest(
self::requestFromArray($data),
self::responseFromArray($data),
self::screenshotsFromArray($data),
);

if ($data['effectiveUri'] && $data['effectiveUri'] !== $data['requestUri']) {
Expand All @@ -65,6 +69,7 @@ public function __serialize(): array
'responseStatusCode' => $this->response->getStatusCode(),
'responseHeaders' => $this->response->getHeaders(),
'responseBody' => Http::getBodyString($this->response),
'screenshots' => array_map(fn(Screenshot $screenshot) => $screenshot->path, $this->screenshots),
];
}

Expand Down Expand Up @@ -104,6 +109,8 @@ public function __unserialize(array $data): void
if ($data['effectiveUri'] && $data['effectiveUri'] !== $data['requestUri']) {
$this->addRedirectUri($data['effectiveUri']);
}

$this->screenshots = self::screenshotsFromArray($data);
}

public function effectiveUri(): string
Expand Down Expand Up @@ -209,4 +216,23 @@ protected static function responseFromArray(array $data): Response
$data['responseBody'],
);
}

/**
 * Rebuilds Screenshot objects from the `screenshots` entry of a data array.
 *
 * The entry is expected to be a list of file paths, as written by __serialize().
 * A missing, null or non-array entry yields an empty list. Entries that are not
 * strings, or whose file does not exist (any more), are skipped.
 *
 * @param mixed[] $data
 * @return Screenshot[]
 */
protected static function screenshotsFromArray(array $data): array
{
    $paths = $data['screenshots'] ?? [];

    // The data may come from a cache; don't let a malformed entry trigger a foreach warning.
    if (!is_array($paths)) {
        return [];
    }

    $screenshots = [];

    foreach ($paths as $path) {
        // file_exists() and the Screenshot constructor both require a string.
        if (is_string($path) && file_exists($path)) {
            $screenshots[] = new Screenshot($path);
        }
    }

    return $screenshots;
}
}
Loading