feat: Add AdaptivePlaywrightCrawler (#872)
Add AdaptivePlaywrightCrawler. The adaptive crawler can choose to crawl a page with either a static crawler (like BeautifulSoupCrawler or ParselCrawler) or the browser-based PlaywrightCrawler.

---------

Co-authored-by: Jan Buchar <[email protected]>
Pijukatel and janbuchar authored Feb 7, 2025
1 parent fd0193f commit 5ba70b6
Showing 20 changed files with 1,596 additions and 64 deletions.
57 changes: 57 additions & 0 deletions docs/examples/code/adaptive_playwright_crawler.py
@@ -0,0 +1,57 @@
import asyncio

from playwright.async_api import Route

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)


async def main() -> None:
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False}
    )

    @crawler.router.handler(label='label')
    async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:
        # Do some processing using `page`.
        some_locator = context.page.locator('div').first
        await some_locator.wait_for()
        # Do stuff with the locator...
        context.log.info(f'Playwright processing of: {context.request.url} ...')

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        context.log.info(f'User handler processing: {context.request.url} ...')
        # Do some processing using `parsed_content`.
        context.log.info(context.parsed_content.title)

        # Find more links and enqueue them.
        await context.enqueue_links()
        await context.push_data({'Top crawler Url': context.request.url})

    @crawler.pre_navigation_hook
    async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        """Hook executed in both the static sub crawler and the playwright sub crawler."""
        # Trying to access `context.page` in this hook would raise `AdaptiveContextError`
        # for pages crawled without playwright.
        context.log.info(f'pre navigation hook for: {context.request.url} ...')

    @crawler.pre_navigation_hook(playwright_only=True)
    async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        """Hook executed only in the playwright sub crawler."""

        async def some_routing_function(route: Route) -> None:
            await route.continue_()

        await context.page.route('*/**', some_routing_function)
        context.log.info(f'Playwright-only pre navigation hook for: {context.request.url} ...')

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])


if __name__ == '__main__':
    asyncio.run(main())
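The example uses the BeautifulSoup-backed constructor; a Parsel-backed variant should work the same way, assuming a `with_parsel_static_parser` counterpart mirrors the signature used above:

    # Hypothetical mirror of the constructor above, swapping BeautifulSoup for Parsel.
    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False}
    )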
4 changes: 4 additions & 0 deletions src/crawlee/_types.py
@@ -565,3 +565,7 @@ class BasicCrawlingContext:

    log: logging.Logger
    """Logger instance."""

    def __hash__(self) -> int:
        """Return hash of the context. Each context is considered unique."""
        return id(self)
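A note on why identity hashing is useful here: if the context derived its hash from field values, two contexts carrying equal fields could collide as dictionary keys. A minimal sketch, assuming the goal is per-context bookkeeping (the `results` mapping is hypothetical, not part of the commit):

    from crawlee.crawlers import BasicCrawlingContext

    # With __hash__ returning id(self), every context instance is a distinct key,
    # even if two contexts happen to carry equal field values.
    results: dict[BasicCrawlingContext, str] = {}

    def remember(context: BasicCrawlingContext, outcome: str) -> None:
        results[context] = outcome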
24 changes: 24 additions & 0 deletions src/crawlee/crawlers/__init__.py
@@ -18,10 +18,31 @@
with _try_import(__name__, 'PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavCrawlingContext'):
    from ._playwright import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext

with _try_import(
    __name__,
    'AdaptivePlaywrightCrawler',
    'AdaptivePlaywrightCrawlingContext',
    'AdaptivePlaywrightPreNavCrawlingContext',
    'RenderingType',
    'RenderingTypePrediction',
    'RenderingTypePredictor',
):
    from ._adaptive_playwright import (
        AdaptivePlaywrightCrawler,
        AdaptivePlaywrightCrawlingContext,
        AdaptivePlaywrightPreNavCrawlingContext,
        RenderingType,
        RenderingTypePrediction,
        RenderingTypePredictor,
    )


__all__ = [
    'AbstractHttpCrawler',
    'AbstractHttpParser',
    'AdaptivePlaywrightCrawler',
    'AdaptivePlaywrightCrawlingContext',
    'AdaptivePlaywrightPreNavCrawlingContext',
    'BasicCrawler',
    'BasicCrawlerOptions',
    'BasicCrawlingContext',
@@ -39,4 +60,7 @@
    'PlaywrightCrawler',
    'PlaywrightCrawlingContext',
    'PlaywrightPreNavCrawlingContext',
    'RenderingType',
    'RenderingTypePrediction',
    'RenderingTypePredictor',
]
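With the extra installed, the new names resolve through `crawlee.crawlers` like the existing ones. A quick import sketch, using only names present in the `__all__` list above:

    # Succeeds only when the 'adaptive-playwright' extra is installed, because
    # the re-export above is guarded by _try_import.
    from crawlee.crawlers import (
        AdaptivePlaywrightCrawler,
        RenderingTypePrediction,
        RenderingTypePredictor,
    )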
56 changes: 46 additions & 10 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -5,7 +5,7 @@
from typing import TYPE_CHECKING, Any, Callable, Generic

from pydantic import ValidationError
from typing_extensions import NotRequired, TypeVar
from typing_extensions import NotRequired, TypedDict, TypeVar

from crawlee import EnqueueStrategy, RequestTransformAction
from crawlee._request import Request, RequestOptions
@@ -14,6 +14,7 @@
from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient
from crawlee.statistics import StatisticsState

from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult

@@ -27,24 +28,33 @@
    from ._abstract_http_parser import AbstractHttpParser

TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


@docs_group('Data structures')
class HttpCrawlerOptions(Generic[TCrawlingContext], BasicCrawlerOptions[TCrawlingContext]):
    """Arguments for the `AbstractHttpCrawler` constructor.

    It is intended for typing forwarded `__init__` arguments in the subclasses.
    """


class _HttpCrawlerAdditionalOptions(TypedDict):
    additional_http_error_status_codes: NotRequired[Iterable[int]]
    """Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""

    ignore_http_error_status_codes: NotRequired[Iterable[int]]
    """HTTP status codes that are typically considered errors but should be treated as successful responses."""


@docs_group('Data structures')
class HttpCrawlerOptions(
    Generic[TCrawlingContext, TStatisticsState],
    _HttpCrawlerAdditionalOptions,
    BasicCrawlerOptions[TCrawlingContext, StatisticsState],
):
    """Arguments for the `AbstractHttpCrawler` constructor.

    It is intended for typing forwarded `__init__` arguments in the subclasses.
    """


@docs_group('Abstract classes')
class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC):
class AbstractHttpCrawler(
    Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC
):
    """A web crawler for performing HTTP requests.

    The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,
@@ -65,7 +75,7 @@ def __init__(
        parser: AbstractHttpParser[TParseResult],
        additional_http_error_status_codes: Iterable[int] = (),
        ignore_http_error_status_codes: Iterable[int] = (),
        **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext]],
        **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
    ) -> None:
        self._parser = parser
        self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
@@ -87,6 +97,32 @@ def __init__(
        kwargs.setdefault('_logger', logging.getLogger(__name__))
        super().__init__(**kwargs)

    @classmethod
    def create_parsed_http_crawler_class(
        cls,
        static_parser: AbstractHttpParser[TParseResult],
    ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]]:
        """Convenience class factory that creates a specific version of the `AbstractHttpCrawler` class.

        In the general typing sense, the two generic parameters of `AbstractHttpCrawler` do not have to depend
        on each other. This convenience constructor covers the specific case where `TParseResult` specifies
        both generic parameters of `AbstractHttpCrawler`.
        """

        class _ParsedHttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]):
            def __init__(
                self,
                parser: AbstractHttpParser[TParseResult] = static_parser,
                **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],
            ) -> None:
                kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
                super().__init__(
                    parser=parser,
                    **kwargs,
                )

        return _ParsedHttpCrawler

    def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]:
        """Create a static content crawler context pipeline with the expected pipeline steps."""
        return (
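As a hedged usage sketch of the new factory (not from the commit): `my_parser` below is a hypothetical, already-constructed `AbstractHttpParser` implementation. The point is that the factory pins both generic parameters to the parser's result type, which is presumably how parser-specific crawlers like `BeautifulSoupCrawler` or `ParselCrawler` can be produced without re-declaring the generics:

    from crawlee.crawlers import AbstractHttpCrawler

    # `my_parser` is a hypothetical AbstractHttpParser[SomeParseResult] instance.
    MyStaticCrawler = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=my_parser)

    # The returned class accepts the usual HttpCrawlerOptions keyword arguments.
    crawler = MyStaticCrawler(max_requests_per_crawl=10)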
22 changes: 22 additions & 0 deletions src/crawlee/crawlers/_adaptive_playwright/__init__.py
@@ -0,0 +1,22 @@
try:
    from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor
except ImportError as exc:
    raise ImportError(
        "To import this, you need to install the 'adaptive-playwright' extra. "
        "For example, if you use pip, run `pip install 'crawlee[adaptive-playwright]'`.",
    ) from exc

from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
from ._adaptive_playwright_crawling_context import (
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)

__all__ = [
    'AdaptivePlaywrightCrawler',
    'AdaptivePlaywrightCrawlingContext',
    'AdaptivePlaywrightPreNavCrawlingContext',
    'RenderingType',
    'RenderingTypePrediction',
    'RenderingTypePredictor',
]
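Downstream code can mirror this guard to degrade gracefully when the extra is absent; a small sketch:

    # Probing for the optional extra at runtime; the import re-raises the
    # ImportError constructed above when 'adaptive-playwright' is missing.
    try:
        from crawlee.crawlers import AdaptivePlaywrightCrawler
        HAS_ADAPTIVE = True
    except ImportError:
        HAS_ADAPTIVE = False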