feat: Add AdaptivePlaywrightCrawler (#872)
Add AdaptivePlaywrightCrawler. The adaptive crawler can choose to crawl a page with either a static crawler (like BeautifulSoupCrawler or ParselCrawler) or the browser-based PlaywrightCrawler.

---------

Co-authored-by: Jan Buchar <[email protected]>
Pijukatel and janbuchar authored Feb 7, 2025
1 parent fd0193f commit 5ba70b6
Showing 20 changed files with 1,596 additions and 64 deletions.
57 changes: 57 additions & 0 deletions docs/examples/code/adaptive_playwright_crawler.py
@@ -0,0 +1,57 @@
import asyncio

from playwright.async_api import Route

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)


async def main() -> None:
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False}
    )

    @crawler.router.handler(label='label')
    async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:
        # Do some processing using `page`.
        some_locator = context.page.locator('div').first
        await some_locator.wait_for()
        # Do stuff with the locator...
        context.log.info(f'Playwright processing of: {context.request.url} ...')

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        context.log.info(f'User handler processing: {context.request.url} ...')
        # Do some processing using `parsed_content`.
        context.log.info(context.parsed_content.title)

        # Find more links and enqueue them.
        await context.enqueue_links()
        await context.push_data({'Top crawler Url': context.request.url})

    @crawler.pre_navigation_hook
    async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        """Hook executed in both the static sub crawler and the playwright sub crawler."""
        # Trying to access `context.page` in this hook would raise `AdaptiveContextError`
        # for pages crawled without playwright.
        context.log.info(f'pre navigation hook for: {context.request.url} ...')

    @crawler.pre_navigation_hook(playwright_only=True)
    async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        """Hook executed only in the playwright sub crawler."""

        async def some_routing_function(route: Route) -> None:
            await route.continue_()

        await context.page.route('*/**', some_routing_function)
        context.log.info(f'Playwright-only pre navigation hook for: {context.request.url} ...')

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])


if __name__ == '__main__':
    asyncio.run(main())
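The example uses the BeautifulSoup-backed constructor; a Parsel-backed variant should work the same way, assuming a `with_parsel_static_parser` counterpart mirrors the signature used above:

    # Hypothetical mirror of the constructor above, swapping BeautifulSoup for Parsel.
    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False}
    )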
4 changes: 4 additions & 0 deletions src/crawlee/_types.py
@@ -565,3 +565,7 @@ class BasicCrawlingContext:

    log: logging.Logger
    """Logger instance."""

    def __hash__(self) -> int:
        """Return hash of the context. Each context is considered unique."""
        return id(self)
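A note on why identity hashing is useful here: if the context derived its hash from field values, two contexts carrying equal fields could collide as dictionary keys. A minimal sketch, assuming the goal is per-context bookkeeping (the `results` mapping is hypothetical, not part of the commit):

    from crawlee.crawlers import BasicCrawlingContext

    # With __hash__ returning id(self), every context instance is a distinct key,
    # even if two contexts happen to carry equal field values.
    results: dict[BasicCrawlingContext, str] = {}

    def remember(context: BasicCrawlingContext, outcome: str) -> None:
        results[context] = outcome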
24 changes: 24 additions & 0 deletions src/crawlee/crawlers/__init__.py
@@ -18,10 +18,31 @@
with _try_import(__name__, 'PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavCrawlingContext'):
    from ._playwright import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext

with _try_import(
    __name__,
    'AdaptivePlaywrightCrawler',
    'AdaptivePlaywrightCrawlingContext',
    'AdaptivePlaywrightPreNavCrawlingContext',
    'RenderingType',
    'RenderingTypePrediction',
    'RenderingTypePredictor',
):
    from ._adaptive_playwright import (
        AdaptivePlaywrightCrawler,
        AdaptivePlaywrightCrawlingContext,
        AdaptivePlaywrightPreNavCrawlingContext,
        RenderingType,
        RenderingTypePrediction,
        RenderingTypePredictor,
    )


__all__ = [
    'AbstractHttpCrawler',
    'AbstractHttpParser',
    'AdaptivePlaywrightCrawler',
    'AdaptivePlaywrightCrawlingContext',
    'AdaptivePlaywrightPreNavCrawlingContext',
    'BasicCrawler',
    'BasicCrawlerOptions',
    'BasicCrawlingContext',
@@ -39,4 +60,7 @@
    'PlaywrightCrawler',
    'PlaywrightCrawlingContext',
    'PlaywrightPreNavCrawlingContext',
    'RenderingType',
    'RenderingTypePrediction',
    'RenderingTypePredictor',
]
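With the extra installed, the new names resolve through `crawlee.crawlers` like the existing ones. A quick import sketch, using only names present in the `__all__` list above:

    # Succeeds only when the 'adaptive-playwright' extra is installed, because
    # the re-export above is guarded by _try_import.
    from crawlee.crawlers import (
        AdaptivePlaywrightCrawler,
        RenderingTypePrediction,
        RenderingTypePredictor,
    )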
56 changes: 46 additions & 10 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -5,7 +5,7 @@
from typing import TYPE_CHECKING, Any, Callable, Generic

from pydantic import ValidationError
from typing_extensions import NotRequired, TypeVar
from typing_extensions import NotRequired, TypedDict, TypeVar

from crawlee import EnqueueStrategy, RequestTransformAction
from crawlee._request import Request, RequestOptions
@@ -14,6 +14,7 @@
from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient
from crawlee.statistics import StatisticsState

from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult

@@ -27,24 +28,33 @@
    from ._abstract_http_parser import AbstractHttpParser

TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


@docs_group('Data structures')
class HttpCrawlerOptions(Generic[TCrawlingContext], BasicCrawlerOptions[TCrawlingContext]):
    """Arguments for the `AbstractHttpCrawler` constructor.

    It is intended for typing forwarded `__init__` arguments in the subclasses.
    """


class _HttpCrawlerAdditionalOptions(TypedDict):
    additional_http_error_status_codes: NotRequired[Iterable[int]]
    """Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""

    ignore_http_error_status_codes: NotRequired[Iterable[int]]
    """HTTP status codes that are typically considered errors but should be treated as successful responses."""


@docs_group('Data structures')
class HttpCrawlerOptions(
    Generic[TCrawlingContext, TStatisticsState],
    _HttpCrawlerAdditionalOptions,
    BasicCrawlerOptions[TCrawlingContext, StatisticsState],
):
    """Arguments for the `AbstractHttpCrawler` constructor.

    It is intended for typing forwarded `__init__` arguments in the subclasses.
    """


@docs_group('Abstract classes')
class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC):
class AbstractHttpCrawler(
    Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC
):
    """A web crawler for performing HTTP requests.

    The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,
@@ -65,7 +75,7 @@ def __init__(
        parser: AbstractHttpParser[TParseResult],
        additional_http_error_status_codes: Iterable[int] = (),
        ignore_http_error_status_codes: Iterable[int] = (),
        **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext]],
        **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
    ) -> None:
        self._parser = parser
        self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
@@ -87,6 +97,32 @@ def __init__(
        kwargs.setdefault('_logger', logging.getLogger(__name__))
        super().__init__(**kwargs)

    @classmethod
    def create_parsed_http_crawler_class(
        cls,
        static_parser: AbstractHttpParser[TParseResult],
    ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]]:
        """Convenience class factory that creates a specific version of the `AbstractHttpCrawler` class.

        In the general typing sense, the two generic parameters of `AbstractHttpCrawler` do not have to depend
        on each other. This convenience constructor covers the specific case where `TParseResult` specifies
        both generic parameters of `AbstractHttpCrawler`.
        """

        class _ParsedHttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]):
            def __init__(
                self,
                parser: AbstractHttpParser[TParseResult] = static_parser,
                **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],
            ) -> None:
                kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
                super().__init__(
                    parser=parser,
                    **kwargs,
                )

        return _ParsedHttpCrawler

    def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]:
        """Create a static content crawler context pipeline with the expected pipeline steps."""
        return (
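As a hedged usage sketch of the new factory (not from the commit): `my_parser` below is a hypothetical, already-constructed `AbstractHttpParser` implementation. The point is that the factory pins both generic parameters to the parser's result type, which is presumably how parser-specific crawlers like `BeautifulSoupCrawler` or `ParselCrawler` can be produced without re-declaring the generics:

    from crawlee.crawlers import AbstractHttpCrawler

    # `my_parser` is a hypothetical AbstractHttpParser[SomeParseResult] instance.
    MyStaticCrawler = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=my_parser)

    # The returned class accepts the usual HttpCrawlerOptions keyword arguments.
    crawler = MyStaticCrawler(max_requests_per_crawl=10)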
22 changes: 22 additions & 0 deletions src/crawlee/crawlers/_adaptive_playwright/__init__.py
@@ -0,0 +1,22 @@
try:
    from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor
except ImportError as exc:
    raise ImportError(
        "To import this, you need to install the 'adaptive-playwright' extra. "
        "For example, if you use pip, run `pip install 'crawlee[adaptive-playwright]'`.",
    ) from exc

from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
from ._adaptive_playwright_crawling_context import (
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)

__all__ = [
    'AdaptivePlaywrightCrawler',
    'AdaptivePlaywrightCrawlingContext',
    'AdaptivePlaywrightPreNavCrawlingContext',
    'RenderingType',
    'RenderingTypePrediction',
    'RenderingTypePredictor',
]
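Downstream code can mirror this guard to degrade gracefully when the extra is absent; a small sketch:

    # Probing for the optional extra at runtime; the import re-raises the
    # ImportError constructed above when 'adaptive-playwright' is missing.
    try:
        from crawlee.crawlers import AdaptivePlaywrightCrawler
        HAS_ADAPTIVE = True
    except ImportError:
        HAS_ADAPTIVE = False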