Skip to content

Commit

Permalink
use httpx
Browse files Browse the repository at this point in the history
  • Loading branch information
Lupino committed Sep 12, 2023
1 parent 5e5f4b8 commit df87d8f
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 51 deletions.
7 changes: 3 additions & 4 deletions grapy/core/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,9 @@ async def _process(self, req, events=[]):
rsp = await self.process_middleware('after_request', rsp)
await self.process_response(rsp, events)
finally:
if rsp.raw and getattr(rsp.raw, 'close'):
r = rsp.raw.close()
if asyncio.iscoroutine(r):
await r
r = rsp.close()
if asyncio.iscoroutine(r):
await r

async def process_middleware(self, name, obj):
for mid in self.middlewares:
Expand Down
8 changes: 7 additions & 1 deletion grapy/playwright_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,13 @@ def handler(res):
await self.custom_action(page)
content = await page.content()
logger.info(f'{method.upper()} {self.url} {status} {ct}')
return Response(page.url, bytes(content, 'utf-8'), page, status, ct, rsp.headers)
return Response(page.url,
bytes(content, 'utf-8'),
page,
status,
ct,
rsp.headers,
close=page.close)
finally:
self.request_time = time() - start_time

Expand Down
69 changes: 29 additions & 40 deletions grapy/request.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import aiohttp
import httpx
from urllib.parse import urljoin
from .response import Response
from .core import BaseRequest
from .core.exceptions import RetryRequest
import requests
from time import time
import logging
import anyio
Expand All @@ -18,47 +16,41 @@ class Request(BaseRequest):
the Request object
'''

async def _aio_request(self):
def _prepare_client(self, cls):
transport = getattr(self, 'transport', None)
proxies = getattr(self, 'proxy', None)
timeout = int(self.timeout)
return cls(transport=transport, timeout=timeout, proxies=proxies)

def _prepare_request(self, client):
method = self.method.lower()
kwargs = self.kwargs.copy()

connector = getattr(self, 'connector', None)

proxy = getattr(self, 'proxy', None)
if proxy:
kwargs['proxy'] = proxy

url = self.url
return client.request(method, url, **kwargs)

timeout = aiohttp.ClientTimeout(total=int(self.timeout))
async with aiohttp.ClientSession(connector=connector,
timeout=timeout) as client:
async with client.request(method, url, **kwargs) as rsp:
ct = rsp.headers.get('content-type', '')
status = rsp.status
rsp_url = urljoin(url, str(rsp.url))
spider = self.spider
logger.info(f'{method.upper()} {url} {status} {ct} {spider}')
content = await rsp.read()
return Response(rsp_url, content, rsp, status, ct, rsp.headers)

def _request(self):
url = self.url
def _parse_response(self, rsp):
    """Convert an httpx response *rsp* into a grapy ``Response``.

    Removes the stale ``requests``-based branch that was merged into this
    body (``func = getattr(requests, method)`` would raise ``NameError``
    now that ``requests`` is no longer imported, and would have issued
    the HTTP request a second time).

    The final URL is resolved against the originally requested URL with
    ``urljoin`` so redirected responses still carry an absolute location.
    """
    method = self.method.lower()
    url = self.url

    ct = rsp.headers.get('content-type', '')
    status = rsp.status_code
    rsp_url = urljoin(url, str(rsp.url))

    spider = self.spider
    logger.info(f'{method.upper()} {url} {status} {ct} {spider}')
    content = rsp.content
    return Response(rsp_url, content, rsp, status, ct, rsp.headers)

async def _async_request(self):
    """Fetch the URL on an async httpx client and wrap the result.

    Marks this request as async (``sync = False``) before dispatching;
    the client is closed by the ``async with`` block once the response
    has been parsed.
    """
    self.sync = False
    async with self._prepare_client(httpx.AsyncClient) as session:
        raw = await self._prepare_request(session)
        return self._parse_response(raw)

def _request(self):
    """Synchronous counterpart of ``_async_request``.

    Marks this request as sync (``sync = True``); the blocking httpx
    client is closed by the ``with`` block once the response has been
    parsed.
    """
    self.sync = True
    with self._prepare_client(httpx.Client) as session:
        raw = self._prepare_request(session)
        return self._parse_response(raw)

def set_cached(self, content, content_type):
    """Pre-seed this request with a synthetic successful response.

    Builds a status-200 ``Response`` for this request's URL with no raw
    backend object and empty headers, and stores it on ``self.cached``.
    """
    no_headers = {}
    self.cached = Response(self.url, content, None, 200, content_type, no_headers)
Expand All @@ -80,10 +72,7 @@ async def request(self):
if self.sync:
return await anyio.to_thread.run_sync(self._request)

return await self._aio_request()
except aiohttp.client_exceptions.ClientError as e:
logger.error(f"OsConnectionError: {self.url} {e}")
raise RetryRequest()
return await self._async_request()
except Exception as exc:
cls = str(exc.__class__)[8:-2]
logger.error(cls + str(exc) + ': ' + self.url)
Expand Down
16 changes: 14 additions & 2 deletions grapy/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,17 @@ class Response(object):

__slots__ = [
'url', 'raw', 'encoding', 'content', '_soup', '_pdf', 'req', 'headers',
'status', 'content_type'
'status', 'content_type', '_close'
]

def __init__(self, url, content, raw, status, content_type, headers={}):
def __init__(self,
url,
content,
raw,
status,
content_type,
headers={},
close=None):
self.raw = raw
self.url = url
self._soup = None
Expand All @@ -51,6 +58,7 @@ def __init__(self, url, content, raw, status, content_type, headers={}):
self.headers = headers
self.status = status
self.content_type = content_type
self._close = close

@property
def text(self):
Expand Down Expand Up @@ -167,3 +175,7 @@ def pdf(self):
content = self.content
self._pdf = pdfplumber.open(BytesIO(content))
return self._pdf

def close(self):
    """Invoke the injected close callback, if one was provided.

    Returns whatever the callback returns — callers may receive a
    coroutine here (e.g. ``page.close`` from playwright) and are
    expected to await it when needed.  Returns None when no callback
    was supplied.
    """
    callback = self._close
    if callback is None:
        return None
    return callback()
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ classifiers = [
"Programming Language :: Python :: 3",
]
dependencies = [
'asyncio', 'aiohttp', 'beautifulsoup4', 'requests', 'asyncio-pool',
'bloom-filter2'
'asyncio', 'beautifulsoup4', 'asyncio-pool', 'bloom-filter2', 'httpx[socks]'
]

[tool.setuptools]
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
]

requires = [
'asyncio', 'aiohttp', 'beautifulsoup4', 'requests', 'asyncio-pool',
'bloom-filter2'
'asyncio', 'beautifulsoup4', 'asyncio-pool', 'bloom-filter2', 'httpx[socks]'
]

setup(
Expand Down

0 comments on commit df87d8f

Please sign in to comment.