Skip to content

Commit

Permalink
Revert "feat: add resource and file type blocking in browser params"
Browse files Browse the repository at this point in the history
This reverts commit 90bed45.

Request interception slows down the scraper.
  • Loading branch information
Igzak authored and Igzak committed Nov 5, 2024
1 parent 90bed45 commit 39c78b0
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 51 deletions.
27 changes: 10 additions & 17 deletions app/routers/any_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from fastapi import APIRouter, Query, Depends
from fastapi.requests import Request
from pydantic import BaseModel
from playwright.async_api import Browser, Route
from playwright.async_api import Browser

from internal import cache
from internal.browser import (
Expand All @@ -23,8 +23,10 @@
ProxyQueryParams,
)


router = APIRouter(prefix='/api/page', tags=['page'])


class AnyPage(BaseModel):
id: Annotated[str, Query(description='unique result ID')]
url: Annotated[str, Query(description='page URL after redirects, may not match the query URL')]
Expand All @@ -38,24 +40,25 @@ class AnyPage(BaseModel):
title: Annotated[str | None, Query(description="page's title")] = None
status_code: Annotated[int, Query(description='HTTP status code of the page')]


@router.get('', summary='Get any page from the given URL', response_model=AnyPage)
async def get_any_page(
request: Request,
url: Annotated[URLParam, Depends()],
common_params: Annotated[CommonQueryParams, Depends()],
browser_params: Annotated[BrowserQueryParams, Depends()],
proxy_params: Annotated[ProxyQueryParams, Depends()]
proxy_params: Annotated[ProxyQueryParams, Depends()],
) -> dict:
"""
Get any page from the given URL.<br><br>
Page is fetched using Playwright, but no additional processing is done.
"""
# pylint: disable=duplicate-code
# Split URL into parts: host with scheme, path with query, query params as a dict
# split URL into parts: host with scheme, path with query, query params as a dict
host_url, full_path, query_dict = split_url(request.url)

# Get cache data if exists
r_id = cache.make_key(full_path) # Unique result ID
# get cache data if exists
r_id = cache.make_key(full_path) # unique result ID
if common_params.cache:
data = cache.load_result(key=r_id)
if data:
Expand All @@ -64,19 +67,9 @@ async def get_any_page(
browser: Browser = request.state.browser
semaphore: asyncio.Semaphore = request.state.semaphore

# Create a new browser context
# create a new browser context
async with semaphore:
async with new_context(browser, browser_params, proxy_params) as context:
async def block_unwanted_resources(route: Route):
if route.request.resource_type in (browser_params.block_types or []):
await route.abort()
elif route.request.url.lower().endswith(tuple(browser_params.block_extensions or [])):
await route.abort()
else:
await route.continue_()

await context.route("**/*", block_unwanted_resources)

page = await context.new_page()
status = await page_processing(
page=page,
Expand Down Expand Up @@ -106,7 +99,7 @@ async def block_unwanted_resources(route: Route):
if common_params.screenshot:
r['screenshotUri'] = f'{host_url}/screenshot/{r_id}'

# Save result to disk
# save result to disk
if common_params.cache:
cache.dump_result(r, key=r_id, screenshot=screenshot)
return r
34 changes: 0 additions & 34 deletions app/routers/query_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,28 +201,6 @@ def __init__(
)
),
] = WaitUntilEnum.DOMCONTENTLOADED,
block_types: Annotated[
str | None,
Query(
alias='block-types',
description=(
'List of resource types to block on the page.<br>'
'Examples: `image`, `media`, `font`, etc.<br>'
'By default, all resources are allowed.'
),
),
] = None,
block_extensions: Annotated[
str | None,
Query(
alias='block-extensions',
description=(
'List of file extensions to block on the page.<br>'
'Examples: `.pdf`, `.jpg`, `.zip`, etc.<br>'
'By default, no extensions are blocked.'
),
),
] = None,
sleep: Annotated[
int,
Query(
Expand Down Expand Up @@ -382,24 +360,12 @@ def __init__(
self.timezone = timezone
self.http_credentials = None
self.extra_http_headers = None
self.block_types = block_types
self.block_extensions = block_extensions

if resource:
resource = list(filter(None, map(str.strip, resource.split(','))))
if resource:
self.resource = resource

if block_extensions:
block_extensions = list(filter(None, map(str.strip, block_extensions.split(','))))
if block_extensions:
self.block_extensions = block_extensions

if block_types:
block_types = list(filter(None, map(str.strip, block_types.split(','))))
if block_types:
self.block_types = block_types

if device not in DEVICE_REGISTRY:
raise QueryParsingError('device', 'Device not found', device)

Expand Down

0 comments on commit 39c78b0

Please sign in to comment.