-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #77 from leozqin/unretrievable
Detect unretrievable or banned sites and present a proper user agent
- Loading branch information
Showing 30 changed files with 479 additions and 272 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +0,0 @@ | ||
from enum import Enum | ||
|
||
from app.content.playwright import PlaywrightContentRetriever | ||
from app.content.requests import RequestsContentRetriever | ||
|
||
# Registry mapping a handler id string to the retriever class that implements it.
content_retrieval_handlers = { | ||
"requests": RequestsContentRetriever, | ||
"playwright": PlaywrightContentRetriever, | ||
} | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,31 +1,52 @@ | ||
from __future__ import annotations | ||
|
||
from logging import getLogger | ||
|
||
from playwright.async_api import Playwright, Route, async_playwright | ||
|
||
from app.constants import USER_AGENT | ||
from app.handlers import ContentRetrievalHandler | ||
|
||
logger = getLogger("uvicorn.error") | ||
|
||
|
||
# Content retriever that loads a page with Playwright's Chromium browser.
# NOTE(review): this listing is a rendered diff with indentation stripped; where two
# near-identical lines appear back to back they look like the removed and added sides
# of one change — confirm against the repository before relying on either.
class PlaywrightContentRetriever(ContentRetrievalHandler): | ||
id = "playwright" | ||
|
||
# Route handler: aborts stylesheet, image and font requests; every other
# resource type (scripts included) is allowed to continue.
@staticmethod | ||
async def _block_common_with_script(route: Route): | ||
excluded_resource_types = ["stylesheet", "image", "font"] | ||
if route.request.resource_type in excluded_resource_types: | ||
await route.abort() | ||
else: | ||
await route.continue_() | ||
|
||
# Route handler: aborts stylesheet, script, image and font requests; every
# other resource type is allowed to continue.
@staticmethod | ||
async def _block_common(route: Route): | ||
# Both assignments list the same four resource types, only in a different order.
excluded_resource_types = ["stylesheet", "script", "image", "font"] | ||
excluded_resource_types = ["stylesheet", "image", "font", "script"] | ||
if route.request.resource_type in excluded_resource_types: | ||
await route.abort() | ||
else: | ||
await route.continue_() | ||
|
||
# Launches Chromium, opens a page, installs a route handler for every request
# ("**/*"), navigates to url, waits for the "domcontentloaded" load state and
# returns the result of page.content().
# NOTE(review): the browser is never closed explicitly here — presumably it is
# torn down when the async_playwright() context in get_html exits; confirm.
@staticmethod | ||
async def _retrieve(url: str, playright: Playwright): | ||
async def _retrieve(url: str, playright: Playwright, use_script: bool = False): | ||
browser = await playright.chromium.launch() | ||
page = await browser.new_page() | ||
page = await browser.new_page(user_agent=USER_AGENT) | ||
|
||
# use_script=True selects the handler that lets scripts load; otherwise scripts are blocked too.
retriever = ( | ||
PlaywrightContentRetriever._block_common_with_script | ||
if use_script | ||
else PlaywrightContentRetriever._block_common | ||
) | ||
|
||
await page.route("**/*", PlaywrightContentRetriever._block_common) | ||
await page.route("**/*", retriever) | ||
await page.goto(url) | ||
|
||
await page.wait_for_load_state("domcontentloaded") | ||
|
||
return await page.content() | ||
|
||
# Entry point: opens an async_playwright() context and delegates to _retrieve,
# forwarding use_script in the get_html variant.
async def get_content(self, url: str) -> str: | ||
async def get_html(self, url: str, use_script: bool = False) -> str:
async with async_playwright() as pw:
return await self._retrieve(url=url, playright=pw)
return await self._retrieve(url=url, playright=pw, use_script=use_script) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,21 @@ | ||
import requests | ||
|
||
from app.constants import USER_AGENT | ||
from app.handlers import ContentRetrievalHandler | ||
from app.models import EntryContent, FeedEntry | ||
|
||
|
||
class RequestsContentRetriever(ContentRetrievalHandler): | ||
id = "requests" | ||
headers = {"User-Agent": USER_AGENT} | ||
|
||
# Fetches url with requests.get (no custom headers) and returns the response body as text.
# NOTE(review): appears to be the pre-change side of this diff hunk — confirm against the repository.
async def get_content(self, url: str) -> str: | ||
|
||
page = requests.get(url) | ||
|
||
return page.text | ||
# requests does not implement the use_script option so we'll just ignore it | ||
async def get_html(self, url: str, use_script: bool = False) -> "str | None":
    """Fetch ``url`` with ``requests`` and return the response body as text.

    Args:
        url: Address of the page to download.
        use_script: Accepted so the signature matches the Playwright
            retriever's ``get_html``; ``requests`` cannot run scripts, so
            the flag is ignored.

    Returns:
        The response body as text, or ``None`` when the request fails or
        the body is empty.
    """
    # NOTE(review): requests.get is a blocking call inside a coroutine, so it
    # holds the event loop for the duration of the download — confirm this
    # is acceptable for the callers.
    try:
        page = requests.get(url, headers=self.headers)
        body = page.text
    except Exception:
        # Best-effort fetch: any failure means "unretrievable". Unlike the
        # bare ``except:`` this replaces, KeyboardInterrupt and SystemExit
        # still propagate.
        return None

    # An empty body is reported the same way as a failed request.
    return body or None
Oops, something went wrong.