'''
Author: Yaaprogrammer
Date: 2022-04-15 18:59:49
LastEditors: Yaaprogrammer
LastEditTime: 2022-04-22 20:03:36
Copyright (c) 2022 by Yaaprogrammer, All Rights Reserved.
'''
import random
from time import sleep

import regex as re
import requests
from loguru import logger
from tenacity import retry, retry_if_exception_type, stop_after_attempt

from utils import Configuration


class Crawler:
    """Thin wrapper around requests.Session that adds configurable
    retries, timeouts, headers, and a random delay before each request."""

    # Read once at class-definition time so it can parameterise the
    # @retry decorators below.
    retryTimes = Configuration().getProperty("crawler.retry.times")

    def __init__(self) -> None:
        self.__session = requests.Session()
        self.__response = requests.Response()
        self.__config = Configuration()

    @property
    def response(self) -> str:
        """Body of the most recent response, decoded as UTF-8."""
        return self.__response.content.decode('utf-8')

    @retry(retry=retry_if_exception_type(requests.exceptions.RequestException),
           stop=stop_after_attempt(retryTimes))
    def get(self, url: str) -> None:
        self.__randomDelay()
        verify = self.__whetherVerify()
        timeout = self.__getTimeOut()
        headers = self.__getHeaders()
        self.__response = self.__session.get(url=url,
                                             verify=verify,
                                             timeout=timeout,
                                             headers=headers)
        logger.info(f"Get: {url}")

    @retry(retry=retry_if_exception_type(requests.exceptions.RequestException),
           stop=stop_after_attempt(retryTimes))
    def post(self, url: str, data: dict) -> None:
        self.__randomDelay()
        verify = self.__whetherVerify()
        timeout = self.__getTimeOut()
        headers = self.__getHeaders()
        self.__response = self.__session.post(url=url,
                                              data=data,
                                              verify=verify,
                                              timeout=timeout,
                                              headers=headers)
        logger.info(f"Post: {url}")

    def __getHeaders(self) -> dict:
        return self.__config.getProperty('crawler.headers')

    def __getTimeOut(self) -> int:
        return self.__config.getProperty("crawler.timeout")

    def __whetherVerify(self) -> bool:
        return self.__config.getProperty("crawler.verify")

    def __randomDelay(self) -> None:
        # Sleep for a random whole number of seconds within the configured
        # bounds. Local names avoid shadowing the built-ins min() and max().
        minDelay = self.__config.getProperty("crawler.delay.min")
        maxDelay = self.__config.getProperty("crawler.delay.max")
        delay = random.randint(minDelay, maxDelay)
        logger.info(f"Random delay: {delay}s")
        sleep(delay)

    def regSearchFromResponse(self, pattern: str) -> str:
        # Note: raises AttributeError if the pattern does not match.
        return re.search(pattern, self.response).group()

    def existInResponse(self, searchFor: str) -> bool:
        return searchFor in self.response
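

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It assumes the
# Configuration store supplies the crawler.* keys read above (retry.times,
# headers, timeout, verify, delay.min, delay.max); the URL and the search
# strings below are illustrative placeholders only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    crawler = Crawler()
    crawler.get("https://example.com")

    # The decoded body of the last response is exposed via the `response`
    # property; the helpers below search it directly.
    if crawler.existInResponse("Example Domain"):
        title = crawler.regSearchFromResponse(r"<title>.*?</title>")
        logger.info(f"Found title: {title}")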