diff --git a/examples/only_grab.py b/examples/only_grab.py
index 9df2faa..0deb12a 100644
--- a/examples/only_grab.py
+++ b/examples/only_grab.py
@@ -2,8 +2,11 @@
 checking and save them to a file."""
 
 import asyncio
+import warnings
+import logging
 
 from proxybroker import Broker
+from proxybroker.providers import Provider, Blogspot_com, Spys_ru, Proxylist_me
 
 
 async def save(proxies, filename):
@@ -12,20 +15,38 @@ async def save(proxies, filename):
         while True:
             proxy = await proxies.get()
             if proxy is None:
+                logging.info('got None from proxies queue')
                 break
-            f.write('%s:%d\n' % (proxy.host, proxy.port))
+            for proto in proxy.types or ['http', 'https']:
+                proto = proto.lower()
+                row = '%s://%s:%d\n' % (proto, proxy.host, proxy.port)
+                f.write(row)
 
 
 def main():
+    providers = [
+        # Blogspot_com(proto=('HTTP', 'HTTPS')),  # noqa; 24800
+        Provider(
+            url='https://geekelectronics.org/my-servisy/proxy',
+            proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25'),
+        ),  # 400
+        Spys_ru(proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25')),  # noqa; 660
+    ]
     proxies = asyncio.Queue()
     broker = Broker(proxies)
+    # broker = Broker(proxies, providers=providers)
     tasks = asyncio.gather(
-        broker.grab(countries=['US', 'GB'], limit=10),
+        broker.grab(),
         save(proxies, filename='proxies.txt'),
     )
     loop = asyncio.get_event_loop()
+    loop.set_debug(True)
+    loop.slow_callback_duration = 1
+    # Report all mistakes managing asynchronous resources.
+    warnings.simplefilter('always', ResourceWarning)
     loop.run_until_complete(tasks)
 
 
 if __name__ == '__main__':
+    logging.basicConfig(level='INFO')
     main()
diff --git a/proxybroker/api.py b/proxybroker/api.py
index 9f59294..2a22ad0 100644
--- a/proxybroker/api.py
+++ b/proxybroker/api.py
@@ -327,7 +327,7 @@ def _get_tasks(by=MAX_CONCURRENT_PROVIDERS):
                 proxies = await task
                 for proxy in proxies:
                     await self._handle(proxy, check=check)
-            log.debug('Grab cycle is complete')
+            log.info('Grab cycle is complete')
             if self._server:
                 log.debug('fall asleep for %d seconds' % GRAB_PAUSE)
                 await asyncio.sleep(GRAB_PAUSE)
diff --git a/proxybroker/providers.py b/proxybroker/providers.py
index 0b04485..81e52c5 100644
--- a/proxybroker/providers.py
+++ b/proxybroker/providers.py
@@ -1,4 +1,5 @@
 import asyncio
+import os
 import re
 import warnings
 from base64 import b64decode
@@ -79,10 +80,7 @@ async def get_proxies(self):
         ) as self._session:
             await self._pipe()
 
-        log.debug(
-            '%d proxies received from %s: %s'
-            % (len(self.proxies), self.domain, self.proxies)
-        )
+        log.info(f'{len(self.proxies)} proxies received from {self.domain}')
         return self.proxies
 
     async def _pipe(self):
@@ -103,6 +101,8 @@ async def _find_on_pages(self, urls):
 
     async def _find_on_page(self, url, data=None, headers=None, method='GET'):
         page = await self.get(url, data=data, headers=headers, method=method)
+        if not page:
+            return
         oldcount = len(self.proxies)
         try:
             received = self.find_proxies(page)
@@ -112,9 +112,12 @@
                 'Error when executing find_proxies.'
                 'Domain: %s; Error: %r' % (self.domain, e)
             )
+        if not received:
+            log.error(f'Got 0 proxies from {url}')
+            return
         self.proxies = received
         added = len(self.proxies) - oldcount
-        log.debug(
+        log.info(
             '%d(%d) proxies added(received) from %s'
             % (added, len(received), url)
         )
@@ -151,7 +154,7 @@ async def _get(self, url, data=None, headers=None, method='GET'):
             aiohttp.ServerDisconnectedError,
         ) as e:
             page = ''
-            log.debug('%s is failed. Error: %r;' % (url, e))
+            log.info('%s is failed. Error: %r;' % (url, e))
         return page
 
     def find_proxies(self, page):
@@ -168,7 +171,7 @@ class Freeproxylists_com(Provider):
     async def _pipe(self):
         exp = r'''href\s*=\s*['"](?P<t>[^'"]*)/(?P<uts>\d{10})[^'"]*['"]'''
         urls = [
-            'http://www.freeproxylists.com/socks.html',
+            # 'http://www.freeproxylists.com/socks.html',
             'http://www.freeproxylists.com/elite.html',
             'http://www.freeproxylists.com/anonymous.html',
         ]
@@ -213,6 +216,8 @@ class Webanetlabs_net(Provider):
     async def _pipe(self):
         exp = r'''href\s*=\s*['"]([^'"]*proxylist_at_[^'"]*)['"]'''
         page = await self.get('https://webanetlabs.net/publ/24')
+        if not page:
+            return
         urls = [
             'https://webanetlabs.net%s' % path for path in re.findall(exp, page)
         ]
@@ -225,6 +230,8 @@ class Checkerproxy_net(Provider):
     async def _pipe(self):
         exp = r'''href\s*=\s*['"](/archive/\d{4}-\d{2}-\d{2})['"]'''
         page = await self.get('https://checkerproxy.net/')
+        if not page:
+            return
         urls = [
             'https://checkerproxy.net/api%s' % path
             for path in re.findall(exp, page)
@@ -244,6 +251,8 @@ async def _pipe(self):
         )  # noqa
         url = 'http://www.proxz.com/proxy_list_high_anonymous_0.html'
         page = await self.get(url)
+        if not page:
+            return
         urls = [
             'http://www.proxz.com/%s' % path for path in re.findall(exp, page)
         ]
@@ -264,6 +273,8 @@ async def _pipe(self):
         exp = r'''href\s*=\s*['"]\./([^'"]?index\.php\?p=\d+[^'"]*)['"]'''
         url = 'http://proxy-list.org/english/index.php?p=1'
         page = await self.get(url)
+        if not page:
+            return
         urls = [
             'http://proxy-list.org/english/%s' % path
             for path in re.findall(exp, page)
@@ -278,7 +289,7 @@ class Aliveproxy_com(Provider):
 
     async def _pipe(self):
         paths = [
-            'socks5-list',
+            # 'socks5-list',
             'high-anonymity-proxy-list',
             'anonymous-proxy-list',
             'fastest-proxies',
@@ -306,6 +317,8 @@ class Maxiproxies_com(Provider):
     async def _pipe(self):
         exp = r'''<a href\s*=\s*['"]([^'"]*example[^'"]*)['"]>'''
         page = await self.get('http://maxiproxies.com/category/proxy-lists/')
+        if not page:
+            return
         urls = re.findall(exp, page)
         await self._find_on_pages(urls)
 
@@ -316,6 +329,8 @@ class _50kproxies_com(Provider):
     async def _pipe(self):
         exp = r'''<a href\s*=\s*['"]([^'"]*-proxy-list-[^'"]*)['"]>'''
         page = await self.get('http://50kproxies.com/category/proxy-list/')
+        if not page:
+            return
         urls = re.findall(exp, page)
         await self._find_on_pages(urls)
 
@@ -326,6 +341,8 @@ class Proxylist_me(Provider):
     async def _pipe(self):
         exp = r'''href\s*=\s*['"][^'"]*/?page=(\d+)['"]'''
         page = await self.get('https://proxylist.me/')
+        if not page:
+            return
         lastId = max([int(n) for n in re.findall(exp, page)])
         urls = ['https://proxylist.me/?page=%d' % n for n in range(lastId)]
         await self._find_on_pages(urls)
@@ -503,6 +520,8 @@ class Proxynova_com(Provider):
     async def _pipe(self):
         expCountries = r'"([a-z]{2})"'
         page = await self.get('https://www.proxynova.com/proxy-server-list/')
+        if not page:
+            return
         tpl = 'https://www.proxynova.com/proxy-server-list/country-%s/'
         urls = [
             tpl % isoCode
@@ -548,6 +567,8 @@ async def _pipe(self):
         expSession = r"'([a-z0-9]{32})'"
         url = 'http://spys.one/proxies/'
         page = await self.get(url)
+        if not page:
+            return
         sessionId = re.findall(expSession, page)[0]
         data = {
             'xf0': sessionId,  # session id
@@ -574,6 +595,8 @@ async def _pipe(self):
         exp = r'''href\s*=\s*['"]([^'"]?free-[^'"]*)['"]'''
         url = 'https://www.my-proxy.com/free-proxy-list.html'
         page = await self.get(url)
+        if not page:
+            return
         urls = [
             'https://www.my-proxy.com/%s' % path
             for path in re.findall(exp, page)
@@ -670,7 +693,7 @@ class Proxylistplus_com(Provider):
     domain = 'list.proxylistplus.com'
 
     async def _pipe(self):
-        names = ['Fresh-HTTP-Proxy', 'SSL', 'Socks']
+        names = ['Fresh-HTTP-Proxy']  # , 'SSL', 'Socks']
         urls = [
             'http://list.proxylistplus.com/%s-List-%d' % (i, n)
             for i in names
@@ -686,8 +709,8 @@ async def _pipe(self):
         urls = [
             'https://www.proxy-list.download/api/v1/get?type=http',
             'https://www.proxy-list.download/api/v1/get?type=https',
-            'https://www.proxy-list.download/api/v1/get?type=socks4',
-            'https://www.proxy-list.download/api/v1/get?type=socks5',
+            # 'https://www.proxy-list.download/api/v1/get?type=socks4',
+            # 'https://www.proxy-list.download/api/v1/get?type=socks5',
         ]
         await self._find_on_pages(urls)
 
@@ -731,10 +754,10 @@ def __init__(self, *args, **kwargs):
             proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25'),
         ),  # 200
         Provider(
-            url='http://fineproxy.org/eng/fresh-proxies/',
+            url='https://t.me/s/proxiesfine',
             proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25'),
-        ),  # 5500
-        Provider(url='https://socks-proxy.net/', proto=('SOCKS4', 'SOCKS5')),  # 80
+        ),  # 4200
+        # Provider(url='https://socks-proxy.net/', proto=('SOCKS4', 'SOCKS5')),  # 80
         Provider(
             url='http://www.httptunnel.ge/ProxyListForFree.aspx',
             proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25'),
@@ -782,12 +805,12 @@ def __init__(self, *args, **kwargs):
         Blogspot_com(
             proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25')
         ),  # noqa; 24800
-        Gatherproxy_com_socks(proto=('SOCKS4', 'SOCKS5')),  # noqa; 30
-        Blogspot_com_socks(proto=('SOCKS4', 'SOCKS5')),  # noqa; 1486
-        Tools_rosinstrument_com(
-            proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25')
-        ),  # noqa; 4000
-        Tools_rosinstrument_com_socks(proto=('SOCKS4', 'SOCKS5')),  # noqa; 1800
+        # Gatherproxy_com_socks(proto=('SOCKS4', 'SOCKS5')),  # noqa; 30
+        # Blogspot_com_socks(proto=('SOCKS4', 'SOCKS5')),  # noqa; 1486
+        # Tools_rosinstrument_com(
+        #     proto=('HTTP', 'CONNECT:80', 'HTTPS', 'CONNECT:25')
+        # ),  # noqa; 4000
+        # Tools_rosinstrument_com_socks(proto=('SOCKS4', 'SOCKS5')),  # noqa; 1800
         My_proxy_com(max_conn=2),  # noqa; 1000
         Checkerproxy_net(),  # noqa; 60000
         Aliveproxy_com(),  # noqa; 210
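
The recurring "if not page: return" guard added across the providers.py hunks above exists because Provider._get() swallows request errors and returns an empty string, so without the guard each _pipe would run re.findall over '' (or crash outright in the Spys case, which indexes re.findall(...)[0]). A minimal standalone sketch of that pattern; the provider name and URL here are hypothetical, and get()/_find_on_pages() are stubbed stand-ins for the real Provider methods:

    import asyncio
    import re


    class ExampleProvider:
        """Hypothetical provider illustrating the empty-page guard."""

        async def get(self, url):
            # Stand-in for Provider.get(), which returns '' on any request failure.
            return ''

        async def _find_on_pages(self, urls):
            print('would scrape:', urls)

        async def _pipe(self):
            page = await self.get('https://example.com/proxy-list')  # hypothetical URL
            if not page:
                return  # nothing fetched; skip parsing entirely
            urls = re.findall(r'''href\s*=\s*['"]([^'"]+)['"]''', page)
            await self._find_on_pages(urls)


    asyncio.run(ExampleProvider()._pipe())

Run as-is it simply returns from _pipe, which is exactly the behavior the patch wants when a provider is unreachable: no regex work, no IndexError, and the grab cycle moves on to the next provider.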