From dbd7ba8163414a9a87e6b369f41bf5686a0f799e Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Mon, 6 Feb 2017 14:42:12 +0100
Subject: [PATCH 001/273] getting back redirect_urls from Scrapy

---
 frontera/contrib/scrapy/converters.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/frontera/contrib/scrapy/converters.py b/frontera/contrib/scrapy/converters.py
index f569024c7..fc013150d 100644
--- a/frontera/contrib/scrapy/converters.py
+++ b/frontera/contrib/scrapy/converters.py
@@ -40,8 +40,8 @@ def to_frontier(self, scrapy_request):
             b'scrapy_meta': scrapy_meta,
             b'origin_is_frontier': True,
         })
-        if b'redirect_urls' in scrapy_meta:
-            meta[b'redirect_urls'] = scrapy_meta[b'redirect_urls']
+        if 'redirect_urls' in scrapy_meta:
+            meta[b'redirect_urls'] = scrapy_meta['redirect_urls']
         return FrontierRequest(url=scrapy_request.url,
                                method=scrapy_request.method,
                                headers=scrapy_request.headers,
@@ -81,8 +81,8 @@ def to_frontier(self, scrapy_response):
         """response: Scrapy > Frontier"""
         frontier_request = scrapy_response.meta[b'frontier_request']
         frontier_request.meta[b'scrapy_meta'] = scrapy_response.meta
-        if b'redirect_urls' in scrapy_response.meta:
-            frontier_request.meta[b'redirect_urls'] = scrapy_response.meta[b'redirect_urls']
+        if 'redirect_urls' in scrapy_response.meta:
+            frontier_request.meta[b'redirect_urls'] = scrapy_response.meta['redirect_urls']
         del scrapy_response.meta[b'frontier_request']
         return FrontierResponse(url=scrapy_response.url,
                                 status_code=scrapy_response.status,

From dfe8a527119a8b407760650167dcf8ecec2eec7f Mon Sep 17 00:00:00 2001
From: voith
Date: Mon, 6 Feb 2017 23:49:59 +0530
Subject: [PATCH 002/273] added test to check codec when send_body=False

---
 tests/test_codecs.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/test_codecs.py b/tests/test_codecs.py
index 7e2aa55f6..b887e24a9 100644
--- a/tests/test_codecs.py
+++ b/tests/test_codecs.py
@@ -7,18 +7,19 @@
 import pytest


+@pytest.mark.parametrize('send_body', [True, False])
 @pytest.mark.parametrize(
     ('encoder', 'decoder'), [
         (MsgPackEncoder, MsgPackDecoder),
         (JsonEncoder, JsonDecoder)
     ]
 )
-def test_codec(encoder, decoder):
+def test_codec(encoder, decoder, send_body):
     def check_request(req1, req2):
         assert req1.url == req2.url and req1.meta == req2.meta and req1.headers == req2.headers \
             and req1.method == req2.method

-    enc = encoder(Request, send_body=True)
+    enc = encoder(Request, send_body=send_body)
     dec = decoder(Request, Response)
     req = Request(url="http://www.yandex.ru",method=b'GET', meta={b"test": b"shmest"}, headers={b'reqhdr': b'value'})
     req2 = Request(url="http://www.yandex.ru/search")
@@ -46,7 +47,11 @@ def check_request(req1, req2):
     o = dec.decode(next(it))
     assert o[0] == 'page_crawled'
     assert type(o[1]) == Response
-    assert o[1].url == req.url and o[1].body == b'SOME CONTENT' and o[1].meta == req.meta
+    assert o[1].url == req.url and o[1].meta == req.meta
+    if send_body:
+        assert o[1].body == b'SOME CONTENT'
+    else:
+        assert o[1].body is None

     o = dec.decode(next(it))
     print(o)

From b715ba332c2debe182972f550fbe36171cfaec7c Mon Sep 17 00:00:00 2001
From: voith
Date: Tue, 7 Feb 2017 00:21:20 +0530
Subject: [PATCH 003/273] fixed json codec when send_body=False

---
 frontera/contrib/backends/remote/codecs/json.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontera/contrib/backends/remote/codecs/json.py b/frontera/contrib/backends/remote/codecs/json.py
index 8c7987bef..7df267894 100644
--- a/frontera/contrib/backends/remote/codecs/json.py
+++ b/frontera/contrib/backends/remote/codecs/json.py
@@ -106,7 +106,7 @@ def _response_from_object(self, obj):
                                       meta=obj[b'meta'])
         return self._response_model(url=url,
                                     status_code=obj[b'status_code'],
-                                    body=b64decode(obj[b'body']),
+                                    body=b64decode(obj[b'body']) if obj[b'body'] is not None else None,
                                     request=request)

     def _request_from_object(self, obj):

From cb74848822679ba2bbfc7873a1c6cc9f804a52ef Mon Sep 17 00:00:00 2001
From: voith
Date: Sat, 10 Dec 2016 22:03:21 +0530
Subject: [PATCH 004/273] fixed msgpack codec

---
 .../contrib/backends/remote/codecs/msgpack.py | 43 ++++++-------------
 1 file changed, 12 insertions(+), 31 deletions(-)

diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py
index 701f61732..9ee99c039 100644
--- a/frontera/contrib/backends/remote/codecs/msgpack.py
+++ b/frontera/contrib/backends/remote/codecs/msgpack.py
@@ -6,31 +6,11 @@
 from msgpack import packb, unpackb

 from frontera.core.codec import BaseDecoder, BaseEncoder
-import six
 from w3lib.util import to_native_str


 def _prepare_request_message(request):
-    def serialize(obj):
-        """Recursively walk object's hierarchy."""
-        if isinstance(obj, six.text_type):
-            return obj.encode('utf8')
-        if isinstance(obj, (bool, six.integer_types, float, six.binary_type)):
-            return obj
-        elif isinstance(obj, dict):
-            obj = obj.copy()
-            for key in obj:
-                obj[key] = serialize(obj[key])
-            return obj
-        elif isinstance(obj, list):
-            return [serialize(item) for item in obj]
-        elif isinstance(obj, tuple):
-            return tuple(serialize([item for item in obj]))
-        elif hasattr(obj, '__dict__'):
-            return serialize(obj.__dict__)
-        else:
-            return None
-    return [request.url, request.method, request.headers, request.cookies, serialize(request.meta)]
+    return [request.url, request.method, request.headers, request.cookies, request.meta]


 def _prepare_response_message(response, send_body):
@@ -42,28 +22,29 @@ def __init__(self, request_model, *a, **kw):
         self.send_body = True if 'send_body' in kw and kw['send_body'] else False

     def encode_add_seeds(self, seeds):
-        return packb([b'as', [_prepare_request_message(seed) for seed in seeds]])
+        return packb([b'as', [_prepare_request_message(seed) for seed in seeds]], use_bin_type=True, encoding="utf-8")

     def encode_page_crawled(self, response):
-        return packb([b'pc', _prepare_response_message(response, self.send_body)])
+        return packb([b'pc', _prepare_response_message(response, self.send_body)], use_bin_type=True, encoding="utf-8")

     def encode_links_extracted(self, request, links):
-        return packb([b'le', _prepare_request_message(request), [_prepare_request_message(link) for link in links]])
+        return packb([b'le', _prepare_request_message(request), [_prepare_request_message(link) for link in links]],
+                     use_bin_type=True, encoding="utf-8")

     def encode_request_error(self, request, error):
-        return packb([b're', _prepare_request_message(request), str(error)])
+        return packb([b're', _prepare_request_message(request), str(error)], use_bin_type=True, encoding="utf-8")

     def encode_request(self, request):
-        return packb(_prepare_request_message(request))
+        return packb(_prepare_request_message(request), use_bin_type=True, encoding="utf-8")

     def encode_update_score(self, request, score, schedule):
-        return packb([b'us', _prepare_request_message(request), score, schedule])
+        return packb([b'us', _prepare_request_message(request), score, schedule], use_bin_type=True, encoding="utf-8")

     def encode_new_job_id(self, job_id):
-        return packb([b'njid', int(job_id)])
+        return packb([b'njid', int(job_id)], use_bin_type=True, encoding="utf-8")

     def encode_offset(self, partition_id, offset):
-        return packb([b'of', int(partition_id), int(offset)])
+        return packb([b'of', int(partition_id), int(offset)], use_bin_type=True, encoding="utf-8")


 class Decoder(BaseDecoder):
@@ -87,7 +68,7 @@ def _request_from_object(self, obj):
                                    meta=obj[4])

     def decode(self, buffer):
-        obj = unpackb(buffer)
+        obj = unpackb(buffer, encoding="utf-8")
         if obj[0] == b'pc':
             return ('page_crawled',
                     self._response_from_object(obj[1]))
@@ -108,4 +89,4 @@ def decode(self, buffer):
         return TypeError('Unknown message type')

     def decode_request(self, buffer):
-        return self._request_from_object(unpackb(buffer))
+        return self._request_from_object(unpackb(buffer, encoding="utf-8"))

From 9c1b87397fe28f03241f39708474ef8ab46dd0f0 Mon Sep 17 00:00:00 2001
From: voith
Date: Mon, 12 Dec 2016 23:01:22 +0530
Subject: [PATCH 005/273] fixed json codec

added test for testing encoded unicode values in meta using msgpack codec
fixed message_bus_backend_test
added tests for _convert, _reconvert
make msgpack requirement >=0.4
made suggested changes
added warning message for non serializable objects and added an early return
renamed method names for encode-decode to convert-revert and added docstring to these methods
renamed method _revert_from_saved_type to _convert_from_saved_type
replaced revert with restores in a doc string
added assertion to check object length in _convert_from_saved_type
resolved merge conflicts
---
 .../contrib/backends/remote/codecs/json.py    | 135 ++++++++++++------
 .../contrib/backends/remote/codecs/msgpack.py |  46 ++++--
 requirements/tests.txt                        |   2 +-
 setup.py                                      |   2 +-
 tests/test_codecs.py                          |  76 +++++++++-
 tests/test_message_bus_backend.py             |   2 +-
 6 files changed, 201 insertions(+), 62 deletions(-)

diff --git a/frontera/contrib/backends/remote/codecs/json.py b/frontera/contrib/backends/remote/codecs/json.py
index 7df267894..ef4aa538d 100644
--- a/frontera/contrib/backends/remote/codecs/json.py
+++ b/frontera/contrib/backends/remote/codecs/json.py
@@ -3,18 +3,64 @@
 """
 from __future__ import absolute_import
 import json
+import six
 from base64 import b64decode, b64encode
 from frontera.core.codec import BaseDecoder, BaseEncoder
-from w3lib.util import to_unicode, to_native_str
-from frontera.utils.misc import dict_to_unicode, dict_to_bytes
+from w3lib.util import to_unicode, to_bytes
+
+
+def _convert_and_save_type(obj):
+    """
+    :param obj: dict object
+
+    The purpose of this method is to transform the given dict
+    into a form that can be serialized with JSONEncoder.
+    In order to implement this, this method converts all byte strings
+    inside a dict to unicode and saves their type for reverting to the
+    original state. The type and the value are stored as a tuple in the
+    following format: (original_type, converted value). All other objects
+    like dict, tuple, list are converted to the same format for the sake
+    of serialization and for the ease of reverting.
+    Refer to `https://github.com/scrapinghub/frontera/pull/233#discussion_r97432868`
+    for a detailed explanation of the design.
+ """ + if isinstance(obj, bytes): + return 'bytes', to_unicode(obj) + elif isinstance(obj, dict): + return 'dict', [(_convert_and_save_type(k), _convert_and_save_type(v)) for k, v in six.iteritems(obj)] + elif isinstance(obj, (list, tuple)): + return type(obj).__name__, [_convert_and_save_type(item) for item in obj] + return 'other', obj + + +def _convert_from_saved_type(obj): + """ + :param obj: object returned by `_convert_and_save_type` + + Restores the original state of the object converted + earlier by `_convert_and_save_type`. This method considers every + first element of the nested tuple as the original type information and + the second value to be the converted value. It applies the original type + recursively on the object to retrieve the original form of the object. + """ + assert len(obj) == 2 + obj_type, obj_value = obj + if obj_type == 'bytes': + return to_bytes(obj_value) + elif obj_type == 'dict': + return dict([(_convert_from_saved_type(k), _convert_from_saved_type(v)) for k, v in obj_value]) + elif obj_type in ['list', 'tuple']: + _type = list if obj_type == 'list' else tuple + return _type([_convert_from_saved_type(item) for item in obj_value]) + return obj_value def _prepare_request_message(request): - return {'url': to_unicode(request.url), - 'method': to_unicode(request.method), - 'headers': dict_to_unicode(request.headers), - 'cookies': dict_to_unicode(request.cookies), - 'meta': dict_to_unicode(request.meta)} + return {'url': request.url, + 'method': request.method, + 'headers': request.headers, + 'cookies': request.cookies, + 'meta': request.meta} def _prepare_links_message(links): @@ -22,10 +68,10 @@ def _prepare_links_message(links): def _prepare_response_message(response, send_body): - return {'url': to_unicode(response.url), + return {'url': response.url, 'status_code': response.status_code, - 'meta': dict_to_unicode(response.meta), - 'body': to_unicode(b64encode(response.body)) if send_body else None} + 'meta': response.meta, + 'body': b64encode(response.body) if send_body else None} class CrawlFrontierJSONEncoder(json.JSONEncoder): @@ -45,6 +91,10 @@ def __init__(self, request_model, *a, **kw): self.send_body = kw.pop('send_body', False) super(Encoder, self).__init__(request_model, *a, **kw) + def encode(self, obj): + encoded = _convert_and_save_type(obj) + return super(Encoder, self).encode(encoded) + def encode_add_seeds(self, seeds): return self.encode({ 'type': 'add_seeds', @@ -101,52 +151,51 @@ def __init__(self, request_model, response_model, *a, **kw): super(Decoder, self).__init__(*a, **kw) def _response_from_object(self, obj): - url = to_native_str(obj[b'url']) + url = obj['url'] request = self._request_model(url=url, - meta=obj[b'meta']) + meta=obj['meta']) return self._response_model(url=url, - status_code=obj[b'status_code'], - body=b64decode(obj[b'body']) if obj[b'body'] is not None else None, + status_code=obj['status_code'], + body=b64decode(obj['body']) if obj['body'] is not None else None, request=request) def _request_from_object(self, obj): - return self._request_model(url=to_native_str(obj[b'url']), - method=obj[b'method'], - headers=obj[b'headers'], - cookies=obj[b'cookies'], - meta=obj[b'meta']) + return self._request_model(url=obj['url'], + method=obj['method'], + headers=obj['headers'], + cookies=obj['cookies'], + meta=obj['meta']) def decode(self, message): - message = dict_to_bytes(super(Decoder, self).decode(message)) - if message[b'type'] == b'links_extracted': - request = self._request_from_object(message[b'r']) - links = 
-            links = [self._request_from_object(link) for link in message[b'links']]
+        message = _convert_from_saved_type(super(Decoder, self).decode(message))
+        if message['type'] == 'links_extracted':
+            request = self._request_from_object(message['r'])
+            links = [self._request_from_object(link) for link in message['links']]
             return ('links_extracted', request, links)
-        if message[b'type'] == b'page_crawled':
-            response = self._response_from_object(message[b'r'])
+        if message['type'] == 'page_crawled':
+            response = self._response_from_object(message['r'])
             return ('page_crawled', response)
-        if message[b'type'] == b'request_error':
-            request = self._request_from_object(message[b'r'])
-            return ('request_error', request, to_native_str(message[b'error']))
-        if message[b'type'] == b'update_score':
-            return ('update_score', self._request_from_object(message[b'r']), message[b'score'], message[b'schedule'])
-        if message[b'type'] == b'add_seeds':
+        if message['type'] == 'request_error':
+            request = self._request_from_object(message['r'])
+            return ('request_error', request, message['error'])
+        if message['type'] == 'update_score':
+            return ('update_score', self._request_from_object(message['r']), message['score'], message['schedule'])
+        if message['type'] == 'add_seeds':
             seeds = []
-            for seed in message[b'seeds']:
+            for seed in message['seeds']:
                 request = self._request_from_object(seed)
                 seeds.append(request)
             return ('add_seeds', seeds)
-        if message[b'type'] == b'new_job_id':
-            return ('new_job_id', int(message[b'job_id']))
-        if message[b'type'] == b'offset':
-            return ('offset', int(message[b'partition_id']), int(message[b'offset']))
+        if message['type'] == 'new_job_id':
+            return ('new_job_id', int(message['job_id']))
+        if message['type'] == 'offset':
+            return ('offset', int(message['partition_id']), int(message['offset']))
         return TypeError('Unknown message type')

     def decode_request(self, message):
-        obj = dict_to_bytes(super(Decoder, self).decode(message))
-        return self._request_model(url=to_native_str(obj[b'url']),
-                                   method=obj[b'method'],
-                                   headers=obj[b'headers'],
-                                   cookies=obj[b'cookies'],
-                                   meta=obj[b'meta'])
-
+        obj = _convert_from_saved_type(super(Decoder, self).decode(message))
+        return self._request_model(url=obj['url'],
+                                   method=obj['method'],
+                                   headers=obj['headers'],
+                                   cookies=obj['cookies'],
+                                   meta=obj['meta'])
diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py
index 9ee99c039..6be589dae 100644
--- a/frontera/contrib/backends/remote/codecs/msgpack.py
+++ b/frontera/contrib/backends/remote/codecs/msgpack.py
@@ -2,15 +2,37 @@
 """ A MsgPack codec for Frontera. Implemented using native msgpack-python library.
""" from __future__ import absolute_import - +import logging from msgpack import packb, unpackb from frontera.core.codec import BaseDecoder, BaseEncoder +import six from w3lib.util import to_native_str +logger = logging.getLogger(__name__) + + def _prepare_request_message(request): - return [request.url, request.method, request.headers, request.cookies, request.meta] + def serialize(obj): + """Recursively walk object's hierarchy.""" + if isinstance(obj, (bool, six.integer_types, float, six.binary_type, six.text_type)): + return obj + elif isinstance(obj, dict): + obj = obj.copy() + for key in obj: + obj[key] = serialize(obj[key]) + return obj + elif isinstance(obj, list): + return [serialize(item) for item in obj] + elif isinstance(obj, tuple): + return tuple(serialize([item for item in obj])) + elif hasattr(obj, '__dict__'): + return serialize(obj.__dict__) + else: + logger.warning('unable to serialize object: {}'.format(obj)) + return None + return [request.url, request.method, request.headers, request.cookies, serialize(request.meta)] def _prepare_response_message(response, send_body): @@ -22,29 +44,29 @@ def __init__(self, request_model, *a, **kw): self.send_body = True if 'send_body' in kw and kw['send_body'] else False def encode_add_seeds(self, seeds): - return packb([b'as', [_prepare_request_message(seed) for seed in seeds]], use_bin_type=True, encoding="utf-8") + return packb([b'as', [_prepare_request_message(seed) for seed in seeds]], use_bin_type=True) def encode_page_crawled(self, response): - return packb([b'pc', _prepare_response_message(response, self.send_body)], use_bin_type=True, encoding="utf-8") + return packb([b'pc', _prepare_response_message(response, self.send_body)], use_bin_type=True) def encode_links_extracted(self, request, links): return packb([b'le', _prepare_request_message(request), [_prepare_request_message(link) for link in links]], - use_bin_type=True, encoding="utf-8") + use_bin_type=True) def encode_request_error(self, request, error): - return packb([b're', _prepare_request_message(request), str(error)], use_bin_type=True, encoding="utf-8") + return packb([b're', _prepare_request_message(request), str(error)], use_bin_type=True) def encode_request(self, request): - return packb(_prepare_request_message(request), use_bin_type=True, encoding="utf-8") + return packb(_prepare_request_message(request), use_bin_type=True) def encode_update_score(self, request, score, schedule): - return packb([b'us', _prepare_request_message(request), score, schedule], use_bin_type=True, encoding="utf-8") + return packb([b'us', _prepare_request_message(request), score, schedule], use_bin_type=True) def encode_new_job_id(self, job_id): - return packb([b'njid', int(job_id)], use_bin_type=True, encoding="utf-8") + return packb([b'njid', int(job_id)], use_bin_type=True) def encode_offset(self, partition_id, offset): - return packb([b'of', int(partition_id), int(offset)], use_bin_type=True, encoding="utf-8") + return packb([b'of', int(partition_id), int(offset)], use_bin_type=True) class Decoder(BaseDecoder): @@ -68,7 +90,7 @@ def _request_from_object(self, obj): meta=obj[4]) def decode(self, buffer): - obj = unpackb(buffer, encoding="utf-8") + obj = unpackb(buffer, encoding='utf-8') if obj[0] == b'pc': return ('page_crawled', self._response_from_object(obj[1])) @@ -89,4 +111,4 @@ def decode(self, buffer): return TypeError('Unknown message type') def decode_request(self, buffer): - return self._request_from_object(unpackb(buffer, encoding="utf-8")) + return 
+        return self._request_from_object(unpackb(buffer, encoding='utf-8'))
diff --git a/requirements/tests.txt b/requirements/tests.txt
index 0ac170f54..455cd0c35 100644
--- a/requirements/tests.txt
+++ b/requirements/tests.txt
@@ -6,7 +6,7 @@ scrapy>=0.24
 SQLAlchemy>=1.0.0
 cachetools
 pyzmq
-msgpack-python
+msgpack-python>=0.4
 kafka-python>=1.0.0
 pytest-cov
 happybase>=1.0.0
diff --git a/setup.py b/setup.py
index e498c97b0..5f305a258 100644
--- a/setup.py
+++ b/setup.py
@@ -64,7 +64,7 @@
         ],
         'zeromq': [
             'pyzmq',
-            'msgpack-python'
+            'msgpack-python>=0.4'
         ],
         'kafka': [
             'kafka-python>=1.0.0'
diff --git a/tests/test_codecs.py b/tests/test_codecs.py
index b887e24a9..82136f14b 100644
--- a/tests/test_codecs.py
+++ b/tests/test_codecs.py
@@ -1,12 +1,44 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
-from frontera.contrib.backends.remote.codecs.json import Encoder as JsonEncoder, Decoder as JsonDecoder
+import json
+import unittest
+from frontera.contrib.backends.remote.codecs.json import (Encoder as JsonEncoder, Decoder as JsonDecoder,
+                                                          _convert_and_save_type, _convert_from_saved_type)
 from frontera.contrib.backends.remote.codecs.msgpack import Encoder as MsgPackEncoder, Decoder as MsgPackDecoder
 from frontera.core.models import Request, Response
 import pytest


+def _compare_dicts(dict1, dict2):
+    """
+    Compares two dicts
+    :return: True if both dicts are equal else False
+    """
+    if dict1 is None or dict2 is None:
+        return False
+
+    if type(dict1) is not dict or type(dict2) is not dict:
+        return False
+
+    shared_keys = set(dict1.keys()) & set(dict2.keys())
+
+    if not (len(shared_keys) == len(dict1.keys()) and len(shared_keys) == len(dict2.keys())):
+        return False
+
+    dicts_are_equal = True
+    for key in dict1.keys():
+        if type(dict1[key]) is dict:
+            dicts_are_equal = _compare_dicts(dict1[key], dict2[key])
+        else:
+            dicts_are_equal = (dict1[key] == dict2[key]) and (type(dict1[key]) == type(dict2[key]))
+
+        if not dicts_are_equal:
+            return False
+
+    return dicts_are_equal
+
+
 @pytest.mark.parametrize('send_body', [True, False])
 @pytest.mark.parametrize(
     ('encoder', 'decoder'), [
@@ -16,12 +48,13 @@
 )
 def test_codec(encoder, decoder, send_body):
     def check_request(req1, req2):
-        assert req1.url == req2.url and req1.meta == req2.meta and req1.headers == req2.headers \
-            and req1.method == req2.method
+        assert req1.url == req2.url and _compare_dicts(req1.meta, req2.meta) and \
+            _compare_dicts(req1.headers, req2.headers) and req1.method == req2.method

     enc = encoder(Request, send_body=send_body)
     dec = decoder(Request, Response)
-    req = Request(url="http://www.yandex.ru",method=b'GET', meta={b"test": b"shmest"}, headers={b'reqhdr': b'value'})
+    req = Request(url="http://www.yandex.ru", method=b'GET',
+                  meta={b'test': b'shmest', b'scrapy_meta': {'rule': 0, 'key': 'value'}}, headers={b'reqhdr': b'value'})
     req2 = Request(url="http://www.yandex.ru/search")
     msgs = [
         enc.encode_add_seeds([req]),
@@ -85,3 +118,38 @@ def check_request(req1, req2):

     o = dec.decode_request(next(it))
     check_request(o, req)
+
+
+class TestEncodeDecodeJson(unittest.TestCase):
+    """
+    Test for testing methods `_convert_and_save_type` and `_convert_from_saved_type` used in json codec
+    """
+
+    def test_encode_decode_json_recursively(self):
+        _int = 1
+        _bytes = b'bytes'
+        _unicode = u'unicode'
+        _bool = True
+        _none = None
+        simple_dict = {'key': 'value'}
+        simple_list = ['item', 1]
+        simple_tuple = ('str', 2)
+        mixed_type_dict = {b'k1': 'v1', 'k2': b'v2', 'int': 1, b'none': None, 'bool': False}
+        mixed_type_list = [b'i1', 'i2', 23, None, True]
+        mixed_type_tuple = (b'i1', 'i2', 23, None, True)
+        nested_dict = {'k1': b'v1', 'lst': [b'i1', 1, ('str', 1, {'k2': b'v1', 'tup': (1, None)})]}
+        nested_list = [True, None, (1, 2, 3), {b'k1': b'v1', 'tup': ('a', b'b', [None, False])}]
+        nested_tuple = (1, None, ['a', 'b', True, {b'k1': 'v2', 'lst': ['a', False, (2, 3, 5)]}])
+        msgs = [_int, _bytes, _unicode, _bool, _none, simple_dict, simple_list, simple_tuple,
+                mixed_type_dict, mixed_type_list, mixed_type_tuple, nested_dict, nested_list, nested_tuple]
+        encoder = json.JSONEncoder()
+        decoder = json.JSONDecoder()
+        for original_msg in msgs:
+            encoded_msg_1 = _convert_and_save_type(original_msg)
+            encoded_msg_2 = encoder.encode(encoded_msg_1)
+            decoded_msg_2 = decoder.decode(encoded_msg_2)
+            decoded_msg_1 = _convert_from_saved_type(decoded_msg_2)
+            if isinstance(decoded_msg_1, dict):
+                self.assertDictEqual(decoded_msg_1, original_msg)
+            elif isinstance(decoded_msg_1, (list, tuple)):
+                self.assertSequenceEqual(decoded_msg_1, original_msg)
diff --git a/tests/test_message_bus_backend.py b/tests/test_message_bus_backend.py
index d4753c52e..68278d133 100644
--- a/tests/test_message_bus_backend.py
+++ b/tests/test_message_bus_backend.py
@@ -49,7 +49,7 @@ def test_page_crawled(self):
         resp = Response(r1.url, body='body', request=r1)
         mbb.page_crawled(resp)
         page = mbb._decoder.decode(mbb.spider_log_producer.messages[0])[1]
-        self.assertEqual((page.request.url, page.body), (resp.request.url, b'body'))
+        self.assertEqual((page.request.url, page.body), (resp.request.url, 'body'))

     def test_links_extracted(self):
         mbb = self.mbb_setup()

From 1129a254c06927a1f4c24d9b9d226031a5ed8554 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Thu, 9 Feb 2017 15:19:06 +0100
Subject: [PATCH 006/273] bump version number -> 0.7.1

---
 docs/source/conf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index e35c09e2b..e720fc4b1 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -55,9 +55,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.6'
+version = '0.7'
 # The full version, including alpha/beta/rc tags.
-release = '0.6.0'
+release = '0.7.1'

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
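
A quick round-trip sketch of the type-preserving JSON conversion introduced in
[PATCH 005/273] above, assuming only the _convert_and_save_type and
_convert_from_saved_type helpers from
frontera/contrib/backends/remote/codecs/json.py (the sample meta dict is
illustrative):

    # Each value is wrapped as a (original_type, converted_value) pair before
    # json.dumps, so byte strings and tuples survive the JSON round trip.
    import json
    from frontera.contrib.backends.remote.codecs.json import (
        _convert_and_save_type, _convert_from_saved_type)

    meta = {b'fingerprint': b'abc', 'depth': 1, 'tags': (b'seed', True)}
    wrapped = _convert_and_save_type(meta)
    # wrapped == ('dict', [(('bytes', 'fingerprint'), ('bytes', 'abc')), ...])
    restored = _convert_from_saved_type(json.loads(json.dumps(wrapped)))
    assert restored == meta  # bytes and the tuple come back with original types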
From 64c033422fd19dfd6a01b28986895b0a0988ebd1 Mon Sep 17 00:00:00 2001
From: voith
Date: Wed, 8 Feb 2017 12:11:08 +0530
Subject: [PATCH 007/273] added tests to integrate scrapy middleware with
 frontera

---
 tests/test_scrapy.py | 91 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 88 insertions(+), 3 deletions(-)

diff --git a/tests/test_scrapy.py b/tests/test_scrapy.py
index e29608001..a08d35e05 100644
--- a/tests/test_scrapy.py
+++ b/tests/test_scrapy.py
@@ -1,12 +1,23 @@
 # -*- coding: utf-8 -*-
-
 from __future__ import absolute_import
-from frontera.contrib.scrapy.converters import RequestConverter, ResponseConverter
+
+import sys
+
+from scrapy.core.spidermw import SpiderMiddlewareManager
+from scrapy.http import Request, Response
 from scrapy.http.request import Request as ScrapyRequest
 from scrapy.http.response import Response as ScrapyResponse
-from frontera.core.models import Request as FrontierRequest
+from scrapy.spiders import Spider
+from scrapy.utils.test import get_crawler
+from twisted.internet.defer import Deferred
+from twisted.trial import unittest
 from w3lib.util import to_bytes

+from frontera.contrib.scrapy.converters import (RequestConverter,
+                                                ResponseConverter)
+from frontera.core.models import Request as FrontierRequest
+from frontera.contrib.scrapy.schedulers.frontier import FronteraScheduler
+

 class TestSpider(object):
     def callback(self):
@@ -75,3 +86,77 @@ def test_request_response_converters():
     frontier_request = FrontierRequest(url)
     request_converted = rc.from_frontier(frontier_request)
     assert frontier_request.url == url
+
+
+class TestFronteraMiddlewaresWithScrapy(unittest.TestCase):
+
+    def setUp(self):
+        class TestSpider(Spider):
+            name = 'test'
+
+        self.spider = TestSpider
+        scrapy_default_middlewares = {
+            'scrapy.spidermiddlewares.referer.RefererMiddleware': 700
+        }
+
+        # monkey patch SPIDER_MIDDLEWARES_BASE to include only referer middleware
+        sys.modules['scrapy.settings.default_settings'].SPIDER_MIDDLEWARES_BASE = scrapy_default_middlewares
+
+        custom_settings = {
+            'SPIDER_MIDDLEWARES': {'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000}
+        }
+        crawler = get_crawler(self.spider, custom_settings)
+        self.add_frontera_scheduler(crawler)
+        self.smw = SpiderMiddlewareManager.from_crawler(crawler)

+    @staticmethod
+    def add_frontera_scheduler(crawler):
+        scheduler = FronteraScheduler(crawler)
+
+        # mock these functions
+        scheduler.frontier.page_crawled = lambda x: x
+        scheduler.frontier.links_extracted = lambda x, y: x
+        scheduler.stats_manager.add_crawled_page = lambda x, y: x
+
+        class Engine(object):
+            def __init__(self, scheduler):
+                self.slot = type('slot', (object,), {})
+                self.slot.scheduler = scheduler
+
+        crawler.engine = Engine(scheduler)
+
+    def test_frontera_scheduler_spider_middleware_with_referer_middleware(self):
+
+        def request_callback(response):
+            yield Request('http://frontera.org')
+
+        req = Request(
+            url='http://www.scrapy.org',
+            callback=request_callback,
+            meta={b'frontier_request': FrontierRequest('http://www.scrapy.org')}
+        )
+
+        res = Response(url='http://www.scrapy.org', request=req)
+
+        def call_request_callback(result, request, spider):
+            dfd = Deferred()
+            dfd.addCallback(request.callback)
+            return dfd
+
+        def test_middleware_output(result):
+            out = list(result)
+            self.assertEquals(len(out), 1)
+            self.assertIsInstance(out[0], Request)
+            self.assertIn('Referer', out[0].headers)
+            self.assertEquals(out[0].headers['Referer'], to_bytes(res.url))
+
+        def test_failure(failure):
+            # work around for test to fail with detailed traceback
+            self._observer._errors.append(failure)
+
+        dfd = self.smw.scrape_response(call_request_callback, res, req, self.spider)
+
+        dfd.addCallback(test_middleware_output)
+        dfd.addErrback(test_failure)
+
+        dfd.callback(res)

From 57de8765afea23a36665748fcfbf2840e85b8770 Mon Sep 17 00:00:00 2001
From: voith
Date: Sun, 12 Feb 2017 14:50:42 +0530
Subject: [PATCH 008/273] fixed schedulers process_spider_output() to yield
 requests

---
 frontera/contrib/scrapy/schedulers/frontier.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/frontera/contrib/scrapy/schedulers/frontier.py b/frontera/contrib/scrapy/schedulers/frontier.py
index f83f08cfa..782a64c23 100644
--- a/frontera/contrib/scrapy/schedulers/frontier.py
+++ b/frontera/contrib/scrapy/schedulers/frontier.py
@@ -110,8 +110,7 @@ def process_spider_output(self, response, result, spider):
         for element in result:
             if isinstance(element, Request):
                 links.append(element)
-            else:
-                yield element
+            yield element
         frontier_request = response.meta[b'frontier_request']
         self.frontier.page_crawled(response)  # removed frontier part from .meta
         # putting it back, to persist .meta from original request

From 3df3244f46151920b25ca615eb1da9bd0b00c230 Mon Sep 17 00:00:00 2001
From: voith
Date: Sun, 12 Feb 2017 15:06:37 +0530
Subject: [PATCH 009/273] fixed test test_process_spider_output in
 test_frontera_scheduler.py

---
 tests/test_frontera_scheduler.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/test_frontera_scheduler.py b/tests/test_frontera_scheduler.py
index fe1b7a50a..3b6e1ed04 100644
--- a/tests/test_frontera_scheduler.py
+++ b/tests/test_frontera_scheduler.py
@@ -113,12 +113,18 @@ def test_next_request_overused_keys_info(self):
     def test_process_spider_output(self):
         i1 = {'name': 'item', 'item': 'i1'}
         i2 = {'name': 'item', 'item': 'i2'}
+        no_requests = 3
         result = [r1, r2, r3, i1, i2]
         resp = Response(fr1.url, request=Request(fr1.url, meta={b'frontier_request': fr1}))
         crawler = FakeCrawler()
         fs = FronteraScheduler(crawler, manager=FakeFrontierManager)
         fs.open(Spider)
-        assert sorted(list(fs.process_spider_output(resp, result, Spider)), key=lambda i: sorted(i['item'])) == \
+        out = list(fs.process_spider_output(resp, result, Spider))
+        assert len(out) == len(result)
+        out_request = out[:no_requests]
+        assert set(r.url for r in out_request) == set(r.url for r in result[:no_requests])
+        out_items = out[no_requests:]
+        assert sorted(out_items, key=lambda i: sorted(i['item'])) == \
             sorted([i1, i2], key=lambda i: sorted(i['item']))
         assert isinstance(fs.frontier.manager.responses[0], FResponse)
         assert fs.frontier.manager.responses[0].url == resp.url

From 08772e4055a27be54bd7040d1ef4c9639a543f8b Mon Sep 17 00:00:00 2001
From: voith
Date: Mon, 13 Feb 2017 23:52:15 +0530
Subject: [PATCH 010/273] fixed a path in docs

---
 docs/source/topics/scrapy-integration.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/topics/scrapy-integration.rst b/docs/source/topics/scrapy-integration.rst
index cbe196b6c..56422ea70 100644
--- a/docs/source/topics/scrapy-integration.rst
+++ b/docs/source/topics/scrapy-integration.rst
@@ -177,7 +177,7 @@ Activating a Seed loader
 Just add the Seed Loader middleware to the ``SPIDER_MIDDLEWARES`` scrapy settings::

     SPIDER_MIDDLEWARES.update({
-        'frontera.contrib.scrapy.middlewares.seeds.FileSeedLoader': 650
+        'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 650
     })


From 39a29c0aeefb88d1b0a936922c424a507488cc3b Mon Sep 17 00:00:00 2001
From: xsren
Date: Tue, 14 Feb 2017 14:56:53 +0800
Subject: [PATCH 011/273] Update message_bus.rst

---
 docs/source/topics/message_bus.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/topics/message_bus.rst b/docs/source/topics/message_bus.rst
index b5cdfb7d3..6f67af0c0 100644
--- a/docs/source/topics/message_bus.rst
+++ b/docs/source/topics/message_bus.rst
@@ -2,7 +2,7 @@
 Message bus
 ===========

-Message bus ss the transport layer abstraction mechanism. Frontera provides interface and several implementations.
+Message bus is the transport layer abstraction mechanism. Frontera provides interface and several implementations.
 Only one message bus can be used in crawler at the time, and it's selected with :setting:`MESSAGE_BUS` setting.

 Spiders process can use
@@ -97,4 +97,4 @@ JSON

 Module: frontera.contrib.backends.remote.codecs.json

-.. _msgpack: http://msgpack.org/index.html
\ No newline at end of file
+.. _msgpack: http://msgpack.org/index.html

From 046430a5bb40bf4217767207f2b40c79a2b83618 Mon Sep 17 00:00:00 2001
From: "Knut O. Hellan"
Date: Thu, 16 Feb 2017 09:16:42 +0100
Subject: [PATCH 012/273] Feature Redis Backend for Frontera

Add Redis backend modelled after the HBase backend. This backend is
meant for use as a metadata store for frontera. Actual pages crawled
are assumed to be stored elsewhere, e.g. via a pipeline.
---
 .../backends/redis_backend/__init__.py       | 387 ++++++++++++++++++
 .../backends/redis_backend/test_redis.py     | 375 +++++++++++++++++
 2 files changed, 762 insertions(+)
 create mode 100644 frontera/contrib/backends/redis_backend/__init__.py
 create mode 100644 tests/contrib/backends/redis_backend/test_redis.py

diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py
new file mode 100644
index 000000000..2f095942c
--- /dev/null
+++ b/frontera/contrib/backends/redis_backend/__init__.py
@@ -0,0 +1,387 @@
+# -*- coding: utf-8 -*-
+from collections import Iterable
+from datetime import datetime
+from frontera.utils.url import parse_domain_from_url_fast
+from frontera import DistributedBackend
+from frontera.core.components import Metadata, Queue, States
+from frontera.core.models import Request
+from frontera.contrib.backends.partitioners import Crc32NamePartitioner
+from frontera.utils.misc import get_crc32
+from frontera.contrib.backends.remote.codecs.msgpack import Decoder, Encoder
+import logging
+from msgpack import packb, unpackb
+from redis import ConnectionPool, StrictRedis
+from time import time
+
+FIELD_CRAWL_AT = b'crawl_at'
+FIELD_CREATED_AT = b'created_at'
+FIELD_DEPTH = b'depth'
+FIELD_DOMAIN = b'domain'
+FIELD_DOMAIN_FINGERPRINT = b'domain_fingerprint'
+FIELD_ERROR = b'error'
+FIELD_FINGERPRINT = b'fingerprint'
+FIELD_NAME = b'name'
+FIELD_SCORE = b'score'
+FIELD_STATE = b'state'
+FIELD_STATUS_CODE = b'status_code'
+FIELD_URL = b'url'
+
+logging.getLogger('boto3.resources.action').setLevel(logging.WARNING)
+
+
+class RedisQueue(Queue):
+    MAX_SCORE = 1.0
+    MIN_SCORE = 0.0
+    SCORE_STEP = 0.01
+
+    def __init__(self, pool, partitions, delete_all_keys=False):
+        self._pool = pool
+        self._partitions = [i for i in range(0, partitions)]
+        self._partitioner = Crc32NamePartitioner(self._partitions)
+        self._logger = logging.getLogger("redis.queue")
+
+        if delete_all_keys:
+            connection = StrictRedis(connection_pool=self._pool)
+            connection.flushdb()
+
+        class DumbResponse:
+            pass
+
+        self._decoder = Decoder(Request, DumbResponse)
+        self._encoder = Encoder(Request)
+
+    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
+        """
+        Tries to get a new batch from the priority queue. Requests whose scheduled
+        crawl time is still in the future are skipped. After a batch is requested
+        it is removed from the queue.
+        :param max_n_requests: maximum number of requests
+        :param partition_id: partition id to get batch from
+        :return: list of :class:`Request` objects.
+        """
+        min_requests = kwargs.pop('min_requests')
+        max_requests_per_host = kwargs.pop('max_requests_per_host')
+        assert (max_n_requests >= min_requests)
+        connection = StrictRedis(connection_pool=self._pool)
+        queue = {}
+        count = 0
+        now_ts = int(time())
+        max_host_items = 0
+        to_remove = []
+        for data in connection.zrevrange(partition_id, start=0, end=max_n_requests):
+            item = unpackb(data, use_list=False)
+            timestamp, fprint, host_crc32, _, score = item
+            if timestamp > now_ts:
+                continue
+            if host_crc32 not in queue:
+                queue[host_crc32] = []
+            if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host:
+                continue
+            queue[host_crc32].append(item)
+            if len(queue[host_crc32]) > max_host_items:
+                max_host_items = len(queue[host_crc32])
+            count += 1
+            to_remove.append(data)
+
+            if count >= max_n_requests:
+                break
+
+        self._logger.debug("Finished: hosts {}, requests {}".format(len(queue.keys()), count))
+
+        results = []
+        for i in range(max_host_items):
+            for host_crc32, items in queue.items():
+                if len(items) <= i:
+                    continue
+                item = items[i]
+                (_, _, _, encoded, score) = item
+                to_remove.append(packb(item))
+                request = self._decoder.decode_request(encoded)
+                request.meta[FIELD_SCORE] = score
+                results.append(request)
+        if len(to_remove) > 0:
+            connection.zrem(partition_id, *to_remove)
+        return results
+
+    def schedule(self, batch):
+        to_schedule = dict()
+        now = int(time())
+        for fprint, score, request, schedule in batch:
+            if schedule:
+                # TODO: This is done by DomainMiddleware - RedisBackend should require DomainMiddleware
+                if FIELD_DOMAIN not in request.meta:
+                    _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
+                    if not hostname:
+                        self._logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint)
+                    request.meta[FIELD_DOMAIN] = {'name': hostname}
+                timestamp = request.meta[FIELD_CRAWL_AT] if FIELD_CRAWL_AT in request.meta else now
+                to_schedule.setdefault(timestamp, []).append((request, score))
+        for timestamp, batch in to_schedule.items():
+            self._schedule(batch, timestamp)
+
+    @classmethod
+    def get_interval_start(cls, score):
+        if score < cls.MIN_SCORE or score > cls.MAX_SCORE:
+            raise OverflowError
+        i = int(score / cls.SCORE_STEP)
+        if i % 10 == 0 and i > 0:
+            i -= 1  # last interval is inclusive from right
+        return i * cls.SCORE_STEP
+
+    def _schedule(self, batch, timestamp):
+        data = dict()
+        for request, score in batch:
+            domain = request.meta[FIELD_DOMAIN]
+            fingerprint = request.meta[FIELD_FINGERPRINT]
+            if type(domain) == dict:
+                partition_id = self._partitioner.partition(domain[FIELD_NAME], self._partitions)
+                host_crc32 = get_crc32(domain[FIELD_NAME])
+            elif type(domain) == int:
+                partition_id = self._partitioner.partition_by_hash(domain, self._partitions)
+                host_crc32 = domain
+            else:
+                raise TypeError("domain of unknown type.")
+            item = (timestamp, fingerprint, host_crc32, self._encoder.encode_request(request), score)
+            interval_start = self.get_interval_start(score)
+            data.setdefault(partition_id, []).extend([int(interval_start * 100), packb(item)])
+        connection = StrictRedis(connection_pool=self._pool)
+        pipe = connection.pipeline()
+        for key, items in data.items():
+            pipe.zadd(key, *items)
+        pipe.execute()
+
+    def count(self):
+        connection = StrictRedis(connection_pool=self._pool)
+        count = 0
+        for partition_id in self._partitions:
+            count += connection.zcard(partition_id)
+        return count
+
+    def frontier_start(self):
+        pass
+
+    def frontier_stop(self):
+        pass
+
+
+class RedisState(States):
+    def __init__(self, pool, cache_size_limit):
+        self._pool = pool
+        self._cache = {}
+        self._cache_size_limit = cache_size_limit
+        self._logger = logging.getLogger("redis.states")
+
+    def update_cache(self, objs):
+        objs = objs if isinstance(objs, Iterable) else [objs]
+
+        def put(obj):
+            self._cache[obj.meta[FIELD_FINGERPRINT]] = obj.meta[FIELD_STATE]
+
+        [put(obj) for obj in objs]
+
+    def set_states(self, objs):
+        objs = objs if isinstance(objs, Iterable) else [objs]
+
+        def get(obj):
+            fprint = obj.meta[FIELD_FINGERPRINT]
+            obj.meta[FIELD_STATE] = self._cache[fprint] if fprint in self._cache else States.DEFAULT
+
+        [get(obj) for obj in objs]
+
+    def flush(self, force_clear):
+        if len(self._cache) > self._cache_size_limit:
+            force_clear = True
+        connection = StrictRedis(connection_pool=self._pool)
+        pipe = connection.pipeline()
+        for fprint, state in self._cache.items():
+            pipe.hmset(fprint, {FIELD_STATE: state})
+        pipe.execute()
+        if force_clear:
+            self._logger.debug("Cache has %d requests, clearing" % len(self._cache))
+            self._cache.clear()
+
+    def fetch(self, fingerprints):
+        to_fetch = [f for f in fingerprints if f not in self._cache]
+        self._logger.debug("cache size %s" % len(self._cache))
+        self._logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints)))
+        connection = StrictRedis(connection_pool=self._pool)
+        pipe = connection.pipeline()
+        for key in to_fetch:
+            pipe.hgetall(key)
+        responses = pipe.execute()
+        for index, key in enumerate(to_fetch):
+            response = responses[index]
+            if len(response) > 0 and FIELD_STATE in response:
+                self._cache[key] = response[FIELD_STATE]
+            else:
+                self._cache[key] = self.NOT_CRAWLED
+
+    def frontier_start(self):
+        pass
+
+    def frontier_stop(self):
+        self.flush(False)
+
+
+class RedisMetadata(Metadata):
+    def __init__(self, pool, delete_all_keys):
+        self._pool = pool
+        self._logger = logging.getLogger("redis.metadata")
+        if delete_all_keys:
+            connection = StrictRedis(connection_pool=self._pool)
+            connection.flushdb()
+
+    @classmethod
+    def timestamp(cls):
+        return str(datetime.utcnow().replace(microsecond=0))
+
+    def add_seeds(self, seeds):
+        connection = StrictRedis(connection_pool=self._pool)
+        pipe = connection.pipeline()
+        for seed in seeds:
+            pipe.hmset(
+                seed.meta[FIELD_FINGERPRINT],
+                {
+                    FIELD_URL: seed.url,
+                    FIELD_DEPTH: 0,
+                    FIELD_CREATED_AT: self.timestamp(),
+                    FIELD_DOMAIN_FINGERPRINT: seed.meta[FIELD_DOMAIN][FIELD_FINGERPRINT]
+                }
+            )
+        pipe.execute()
+
+    def request_error(self, page, error):
+        connection = StrictRedis(connection_pool=self._pool)
+        connection.hmset(
+            page.meta[FIELD_FINGERPRINT],
+            {
+                FIELD_URL: page.url,
+                FIELD_CREATED_AT: self.timestamp(),
+                FIELD_ERROR: error,
+                FIELD_DOMAIN_FINGERPRINT: page.meta[FIELD_DOMAIN][FIELD_FINGERPRINT]
+            }
+        )
+
+    def page_crawled(self, response):
+        connection = StrictRedis(connection_pool=self._pool)
+        connection.hmset(
+            response.meta[FIELD_FINGERPRINT],
+            {
+                FIELD_STATUS_CODE: response.status_code
+            }
+        )
+
+    def links_extracted(self, _, links):
+        links_processed = set()
+        connection = StrictRedis(connection_pool=self._pool)
+        for link in links:
+            link_fingerprint = link.meta[FIELD_FINGERPRINT]
+            if link_fingerprint in links_processed:
+                continue
+            connection.hmset(
+                link_fingerprint,
+                {
+                    FIELD_URL: link.url,
+                    FIELD_CREATED_AT: self.timestamp(),
+                    FIELD_DOMAIN_FINGERPRINT: link.meta[FIELD_DOMAIN][FIELD_FINGERPRINT]
+                }
+            )
+            links_processed.add(link_fingerprint)
+
+    def frontier_start(self):
+        pass
+
+    def frontier_stop(self):
+        pass
+
+
+class RedisBackend(DistributedBackend):
+    component_name = 'Redis Backend'
+
+    def __init__(self, manager):
+        self.manager = manager
+        self._logger = logging.getLogger("redis.backend")
+        settings = manager.settings
+        port = settings.get('REDIS_PORT')
+        host = settings.get('REDIS_HOST')
+        self._min_requests = settings.get('BC_MIN_REQUESTS')
+        self._min_hosts = settings.get('BC_MIN_HOSTS')
+        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')
+
+        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
+        self._logger.info("RedisBackend started with {} partitions".format(self.queue_partitions))
+        self.pool = ConnectionPool(host=host, port=port, db=0)
+        self._metadata = None
+        self._queue = None
+        self._states = None
+
+    @classmethod
+    def strategy_worker(cls, manager):
+        o = cls(manager)
+        settings = manager.settings
+        o._states = RedisState(o.pool, settings.get('REDIS_STATE_CACHE_SIZE_LIMIT'))
+        return o
+
+    @classmethod
+    def db_worker(cls, manager):
+        o = cls(manager)
+        settings = manager.settings
+        clear = settings.get('REDIS_DROP_ALL_TABLES')
+        o._queue = RedisQueue(o.pool, o.queue_partitions, delete_all_keys=clear)
+        o._metadata = RedisMetadata(
+            o.pool,
+            clear
+        )
+        return o
+
+    @property
+    def metadata(self):
+        return self._metadata
+
+    @property
+    def queue(self):
+        return self._queue
+
+    @property
+    def states(self):
+        return self._states
+
+    def frontier_start(self):
+        for component in [self.metadata, self.queue, self.states]:
+            if component:
+                component.frontier_start()
+
+    def frontier_stop(self):
+        for component in [self.metadata, self.queue, self.states]:
+            if component:
+                component.frontier_stop()
+        self.pool.disconnect()
+
+    def add_seeds(self, seeds):
+        self.metadata.add_seeds(seeds)
+
+    def page_crawled(self, response):
+        self.metadata.page_crawled(response)
+
+    def links_extracted(self, request, links):
+        self.metadata.links_extracted(request, links)
+
+    def request_error(self, page, error):
+        self.metadata.request_error(page, error)
+
+    def finished(self):
+        raise NotImplementedError
+
+    def get_next_requests(self, max_next_requests, **kwargs):
+        next_pages = []
+        self._logger.debug("Querying queue table.")
+        partitions = set(kwargs.pop('partitions', []))
+        for partition_id in range(0, self.queue_partitions):
+            if partition_id not in partitions:
+                continue
+            results = self.queue.get_next_requests(max_next_requests, partition_id,
+                                                   min_requests=self._min_requests,
+                                                   min_hosts=self._min_hosts,
+                                                   max_requests_per_host=self._max_requests_per_host)
+            next_pages.extend(results)
+            self._logger.debug("Got %d requests for partition id %d", len(results), partition_id)
+        return next_pages
diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py
new file mode 100644
index 000000000..65d1a6ae1
--- /dev/null
+++ b/tests/contrib/backends/redis_backend/test_redis.py
@@ -0,0 +1,375 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+from frontera.contrib.backends.redis_backend import FIELD_DOMAIN_FINGERPRINT, FIELD_ERROR, FIELD_STATE
+from frontera.contrib.backends.redis_backend import FIELD_STATUS_CODE, FIELD_URL
+from frontera.contrib.backends.redis_backend import RedisMetadata, RedisQueue, RedisState
+from redis import ConnectionPool, StrictRedis
+from time import time
+from unittest import main, TestCase
+
+
+class Request:
+    def __init__(self, fingerprint, crawl_at, url, domain=None):
+        self.meta = {
+            b'crawl_at': crawl_at,
+            b'fingerprint': fingerprint
+        }
+        if domain:
+            self.meta[b'domain'] = {b'name': domain, b'fingerprint': "d_{}".format(fingerprint)}
+        self.url = url
+        self.method = 'https'
+        self.headers = {}
+        self.cookies = None
+        self.status_code = 200
+
+
+def get_pool():
+    port = 32768
+    host = '192.168.99.100'
+    return ConnectionPool(host=host, port=port, db=0)
+
+
+class RedisQueueTest(TestCase):
+    def setup_subject(self, partitions):
+        return RedisQueue(get_pool(), partitions, True)
+
+    def test_scheduling_past_1part_5(self):
+        subject = self.setup_subject(1)
+        batch = [
+            ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True),
+            ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True),
+            ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True),
+        ]
+        subject.schedule(batch)
+        self.assertEqual(3, subject.count())
+        requests = subject.get_next_requests(5, 0, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(3, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.knuthellan.com/' in urls)
+        self.assertTrue('https://www.hellan.me/' in urls)
+        self.assertTrue('https://www.khellan.com/' in urls)
+        self.assertEqual(0, subject.count())
+
+    def test_scheduling_past_1part_1(self):
+        subject = self.setup_subject(1)
+        batch = [
+            ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True),
+            ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True),
+            ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True),
+        ]
+        subject.schedule(batch)
+        self.assertEqual(3, subject.count())
+        requests = subject.get_next_requests(1, 0, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(1, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.knuthellan.com/' in urls)
+        self.assertEqual(2, subject.count())
+
+    def test_scheduling_past_1part_2(self):
+        subject = self.setup_subject(1)
+        batch = [
+            ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True),
+            ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True),
+            ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True),
+        ]
+        subject.schedule(batch)
+        self.assertEqual(3, subject.count())
+        requests = subject.get_next_requests(2, 0, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(2, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.knuthellan.com/' in urls)
+        self.assertTrue('https://www.hellan.me/' in urls)
+        self.assertEqual(1, subject.count())
+
+    def test_scheduling_past_2part_5(self):
+        subject = self.setup_subject(2)
+        batch = [
+            ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True),
+            ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True),
+            ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True),
+        ]
+        subject.schedule(batch)
+        self.assertEqual(3, subject.count())
+
+        requests = subject.get_next_requests(5, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(2, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.knuthellan.com/' in urls)
+        self.assertTrue('https://www.khellan.com/' in urls)
+        self.assertEqual(1, subject.count())
+
+        requests = subject.get_next_requests(5, partition_id=1, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(1, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.hellan.me/' in urls)
+        self.assertEqual(0, subject.count())
+
+    def test_scheduling_past_2part_2(self):
+        subject = self.setup_subject(2)
+        batch = [
+            ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True),
+            ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True),
+            ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True),
+        ]
+        subject.schedule(batch)
+        self.assertEqual(3, subject.count())
+        requests = subject.get_next_requests(2, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(2, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.knuthellan.com/' in urls)
+        self.assertTrue('https://www.khellan.com/' in urls)
+        self.assertEqual(1, subject.count())
+
+        requests = subject.get_next_requests(2, partition_id=1, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(1, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.hellan.me/' in urls)
+        self.assertEqual(0, subject.count())
+
+    def test_scheduling_past_2part_1(self):
+        subject = self.setup_subject(2)
+        batch = [
+            ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True),
+            ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True),
+            ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True),
+        ]
+        subject.schedule(batch)
+        self.assertEqual(3, subject.count())
+        requests = subject.get_next_requests(1, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(1, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.knuthellan.com/' in urls)
+
+        requests = subject.get_next_requests(1, partition_id=1, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(1, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.hellan.me/' in urls)
+        self.assertEqual(1, subject.count())
+
+    def test_scheduling_past_2part_multiple(self):
+        subject = self.setup_subject(2)
+        batch = [
+            ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True),
+            ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True),
+            ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True),
+        ]
+        subject.schedule(batch)
+        self.assertEqual(3, subject.count())
+
+        requests = subject.get_next_requests(1, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(1, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.knuthellan.com/' in urls)
+        self.assertEqual(2, subject.count())
+
+        requests = subject.get_next_requests(1, partition_id=1, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(1, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.hellan.me/' in urls)
+        self.assertEqual(1, subject.count())
+
+        requests = subject.get_next_requests(1, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(1, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.khellan.com/' in urls)
+        self.assertEqual(0, subject.count())
+
+        requests = subject.get_next_requests(1, partition_id=1, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(0, len(requests))
+
+        requests = subject.get_next_requests(1, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(0, len(requests))
+
+    def test_scheduling_future(self):
+        subject = self.setup_subject(1)
+        batch = [
+            ("1", 1, Request("1", int(time()) + 86400, 'https://www.knuthellan.com/', domain='knuthellan.com'), True),
+            ("2", 0.1, Request("2", int(time()) + 86400, 'https://www.khellan.com/', domain='khellan.com'), True),
+            ("3", 0.5, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True),
+        ]
+        subject.schedule(batch)
+        self.assertEqual(3, subject.count())
+
+        requests = subject.get_next_requests(5, 0, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(0, len(requests))
+
+    def test_scheduling_mix(self):
+        subject = self.setup_subject(1)
+        batch = [
+            ("1", 1, Request("1", int(time()) + 86400, 'https://www.knuthellan.com/', domain='knuthellan.com'), True),
+            ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True),
+            ("3", 0.5, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True),
+        ]
+        subject.schedule(batch)
+        self.assertEqual(3, subject.count())
+
+        requests = subject.get_next_requests(5, 0, min_hosts=1, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(1, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.khellan.com/' in urls)
+        self.assertEqual(2, subject.count())
+
+
+class RedisStateTest(TestCase):
+    def test_update_cache(self):
+        subject = RedisState(get_pool(), 10)
+        r1 = Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com')
+        r1.meta[b'state'] = b'a'
+        r2 = Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com')
+        r2.meta[b'state'] = b'b'
+        r3 = Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me')
+        r3.meta[b'state'] = b'c'
+        batch = [r1, r2, r3]
+        subject.update_cache(batch)
+        self.assertEqual(3, len(subject._cache))
+        self.assertEqual(b'a', subject._cache["1"])
+        self.assertEqual(b'b', subject._cache["2"])
+        self.assertEqual(b'c', subject._cache["3"])
+
+    def test_set_states(self):
+        subject = RedisState(get_pool(), 10)
+        r1 = Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com')
+        r1.meta[b'state'] = b'a'
+        r2 = Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com')
+        r2.meta[b'state'] = b'b'
+        r3 = Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me')
+        r3.meta[b'state'] = b'c'
+        batch = [r1, r2, r3]
+        subject.update_cache(batch)
Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r5 = Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r6 = Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + batch2 = [r4, r5, r6] + subject.set_states(batch2) + self.assertEqual(b'a', r4.meta[b'state']) + self.assertEqual(b'b', r5.meta[b'state']) + self.assertEqual(b'c', r6.meta[b'state']) + + def test_flush_no_force(self): + pool = get_pool() + subject = RedisState(pool, 10) + r1 = Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r1.meta[b'state'] = b'a' + r2 = Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r2.meta[b'state'] = b'b' + r3 = Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + r3.meta[b'state'] = b'c' + batch = [r1, r2, r3] + subject.update_cache(batch) + subject.flush(False) + self.assertEqual(3, len(subject._cache)) + connection = StrictRedis(connection_pool=pool) + self.assertEqual({FIELD_STATE: b'a'}, connection.hgetall("1")) + self.assertEqual({FIELD_STATE: b'b'}, connection.hgetall("2")) + self.assertEqual({FIELD_STATE: b'c'}, connection.hgetall("3")) + + def test_flush_force(self): + pool = get_pool() + subject = RedisState(pool, 10) + r1 = Request("4", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r1.meta[b'state'] = b'd' + r2 = Request("5", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r2.meta[b'state'] = b'e' + r3 = Request("6", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + r3.meta[b'state'] = b'f' + batch = [r1, r2, r3] + subject.update_cache(batch) + subject.flush(True) + self.assertEqual(0, len(subject._cache)) + connection = StrictRedis(connection_pool=pool) + self.assertEqual({FIELD_STATE: b'd'}, connection.hgetall("4")) + self.assertEqual({FIELD_STATE: b'e'}, connection.hgetall("5")) + self.assertEqual({FIELD_STATE: b'f'}, connection.hgetall("6")) + + def test_flush_cache_overflow(self): + pool = get_pool() + subject = RedisState(pool, 1) + r1 = Request("4", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r1.meta[b'state'] = b'd' + r2 = Request("5", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r2.meta[b'state'] = b'e' + r3 = Request("6", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + r3.meta[b'state'] = b'f' + batch = [r1, r2, r3] + subject.update_cache(batch) + subject.flush(False) + self.assertEqual(0, len(subject._cache)) + connection = StrictRedis(connection_pool=pool) + self.assertEqual({FIELD_STATE: b'd'}, connection.hgetall("4")) + self.assertEqual({FIELD_STATE: b'e'}, connection.hgetall("5")) + self.assertEqual({FIELD_STATE: b'f'}, connection.hgetall("6")) + + def test_fetch(self): + subject = RedisState(get_pool(), 1) + r1 = Request("7", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r1.meta[b'state'] = b'g' + r2 = Request("8", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r2.meta[b'state'] = b'h' + batch = [r1, r2] + subject.update_cache(batch) + subject.flush(True) + r3 = Request("9", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + r3.meta[b'state'] = b'i' + subject.update_cache(r3) + self.assertEqual(1, len(subject._cache)) + to_fetch = ["7", "9"] + subject.fetch(to_fetch) + self.assertEqual(2, len(subject._cache)) + self.assertEqual(b'g', subject._cache["7"]) + self.assertEqual(b'i', 
subject._cache["9"]) + + +class RedisMetadataTest(TestCase): + def test_add_seeds(self): + pool = get_pool() + subject = RedisMetadata(pool, True) + r1 = Request("md1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r2 = Request("md2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r3 = Request("md3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + seeds = [r1, r2, r3] + subject.add_seeds(seeds) + connection = StrictRedis(connection_pool=pool) + self.assertEqual(b'https://www.knuthellan.com/', connection.hmget('md1', FIELD_URL)[0]) + self.assertEqual(b'd_md1', connection.hmget('md1', FIELD_DOMAIN_FINGERPRINT)[0]) + self.assertEqual(b'https://www.khellan.com/', connection.hmget("md2", FIELD_URL)[0]) + self.assertEqual(b'd_md2', connection.hmget('md2', FIELD_DOMAIN_FINGERPRINT)[0]) + self.assertEqual(b'https://www.hellan.me/', connection.hmget("md3", FIELD_URL)[0]) + self.assertEqual(b'd_md3', connection.hmget('md3', FIELD_DOMAIN_FINGERPRINT)[0]) + + def test_request_error(self): + pool = get_pool() + subject = RedisMetadata(pool, True) + r1 = Request("md1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + subject.request_error(r1, 404) + connection = StrictRedis(connection_pool=pool) + self.assertEqual(b'https://www.knuthellan.com/', connection.hmget('md1', FIELD_URL)[0]) + self.assertEqual(b'd_md1', connection.hmget('md1', FIELD_DOMAIN_FINGERPRINT)[0]) + self.assertEqual(b'404', connection.hmget('md1', FIELD_ERROR)[0]) + + def test_page_crawled(self): + pool = get_pool() + subject = RedisMetadata(pool, True) + r1 = Request("md1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + subject.page_crawled(r1) + connection = StrictRedis(connection_pool=pool) + self.assertEqual(b'200', connection.hmget('md1', FIELD_STATUS_CODE)[0]) + + def test_links_extracted(self): + pool = get_pool() + subject = RedisMetadata(pool, True) + l1 = Request("l1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + l2 = Request("l2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + l3 = Request("l3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + links = [l1, l2, l3] + subject.links_extracted(None, links) + connection = StrictRedis(connection_pool=pool) + self.assertEqual(b'https://www.knuthellan.com/', connection.hmget('l1', FIELD_URL)[0]) + self.assertEqual(b'd_l1', connection.hmget('l1', FIELD_DOMAIN_FINGERPRINT)[0]) + self.assertEqual(b'https://www.khellan.com/', connection.hmget("l2", FIELD_URL)[0]) + self.assertEqual(b'd_l2', connection.hmget('l2', FIELD_DOMAIN_FINGERPRINT)[0]) + self.assertEqual(b'https://www.hellan.me/', connection.hmget("l3", FIELD_URL)[0]) + self.assertEqual(b'd_l3', connection.hmget('l3', FIELD_DOMAIN_FINGERPRINT)[0]) + + +if __name__ == '__main__': + main() From 9c76a1bd008571b77763e3a270b96aa6366bac9a Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Tue, 21 Feb 2017 10:41:14 +0100 Subject: [PATCH 013/273] Update after review Correct function header for get_next_requests in RedisQueue. Remove all references to the min_requests parameter. 
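With min_requests gone, callers of RedisQueue.get_next_requests only bound the batch size from above. For reference, a minimal usage sketch against the signature at this point in the series (it assumes a Redis instance on localhost, as the tests do; the numbers are illustrative):

    from frontera.contrib.backends.redis_backend import RedisQueue
    from redis import ConnectionPool

    # Fetch up to 10 requests for partition 0, allowing at most 5 per host.
    # No lower bound on the batch size is enforced any more.
    pool = ConnectionPool(host='localhost', port=6379, db=0)
    queue = RedisQueue(pool, partitions=1, delete_all_keys=True)
    batch = queue.get_next_requests(10, partition_id=0, min_hosts=1,
                                    max_requests_per_host=5)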
--- frontera/contrib/backends/redis_backend/__init__.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 2f095942c..d403e3e09 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -26,8 +26,6 @@ FIELD_STATUS_CODE = b'status_code' FIELD_URL = b'url' -logging.getLogger('boto3.resources.action').setLevel(logging.WARNING) - class RedisQueue(Queue): MAX_SCORE = 1.0 @@ -52,15 +50,12 @@ class DumbResponse: def get_next_requests(self, max_n_requests, partition_id, **kwargs): """ - Tries to get new batch from priority queue. It makes self.GET_RETRIES tries and stops, trying to fit all - parameters. Every new iteration evaluates a deeper batch. After batch is requested it is removed from the queue. + Fet new batch from priority queue. :param max_n_requests: maximum number of requests :param partition_id: partition id to get batch from :return: list of :class:`Request ` objects. """ - min_requests = kwargs.pop('min_requests') max_requests_per_host = kwargs.pop('max_requests_per_host') - assert (max_n_requests >= min_requests) connection = StrictRedis(connection_pool=self._pool) queue = {} count = 0 @@ -303,7 +298,6 @@ def __init__(self, manager): settings = manager.settings port = settings.get('REDIS_PORT') host = settings.get('REDIS_HOST') - self._min_requests = settings.get('BC_MIN_REQUESTS') self._min_hosts = settings.get('BC_MIN_HOSTS') self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST') @@ -379,7 +373,6 @@ def get_next_requests(self, max_next_requests, **kwargs): if partition_id not in partitions: continue results = self.queue.get_next_requests(max_next_requests, partition_id, - min_requests=self._min_requests, min_hosts=self._min_hosts, max_requests_per_host=self._max_requests_per_host) next_pages.extend(results) From 0cf2f3763b5da52ff203a9acd87cee14e7627ce6 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Tue, 21 Feb 2017 10:43:37 +0100 Subject: [PATCH 014/273] Set up Redis testing in TravisCI Add redis-server to travis config. Use Travis config for Redis in tests. --- .travis.yml | 1 + tests/contrib/backends/redis_backend/test_redis.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index a2f138639..88a36a1e4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,6 +10,7 @@ services: - docker - mysql - postgresql + - redis-server env: global: diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py index 65d1a6ae1..a0d55daba 100644 --- a/tests/contrib/backends/redis_backend/test_redis.py +++ b/tests/contrib/backends/redis_backend/test_redis.py @@ -24,8 +24,8 @@ def __init__(self, fingerprint, crawl_at, url, domain=None): def get_pool(): - port = 32768 - host = '192.168.99.100' + port = 9763 + host = 'localhost' return ConnectionPool(host=host, port=port, db=0) From fb422c658ede3a1cc79a0b9261cee7f521b3f2a2 Mon Sep 17 00:00:00 2001 From: "Knut O. 
Hellan" Date: Tue, 21 Feb 2017 14:30:55 +0100 Subject: [PATCH 015/273] Added redis to install list Add redis and hiredis to requirements.txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 7c718e7af..088a2579f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ six>=1.8.0 w3lib>=1.15.0 +redis>=2.10.5 +hiredis>=0.2 From b6b964628c1cc92b44df88a718246b77f74a6228 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Tue, 21 Feb 2017 15:43:26 +0100 Subject: [PATCH 016/273] Bugfix Redis port Set port number 6379 for Redis. --- tests/contrib/backends/redis_backend/test_redis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py index a0d55daba..8a0520705 100644 --- a/tests/contrib/backends/redis_backend/test_redis.py +++ b/tests/contrib/backends/redis_backend/test_redis.py @@ -24,7 +24,7 @@ def __init__(self, fingerprint, crawl_at, url, domain=None): def get_pool(): - port = 9763 + port = 6379 host = 'localhost' return ConnectionPool(host=host, port=port, db=0) From 228c7f4bb2c1a2358fb7be9ed72ac01787f6460d Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Tue, 21 Feb 2017 15:59:42 +0100 Subject: [PATCH 017/273] Bugfix Travis config Add redis service to 3.5 config. --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 88a36a1e4..e56db0d0a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,6 +27,7 @@ matrix: - docker - mysql - postgresql + - redis-server install: - pip install -U tox wheel codecov From b0ee89a5455adc856fec27694dde10c050bbd16a Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Wed, 22 Feb 2017 10:00:39 +0100 Subject: [PATCH 018/273] Improved dependency handling Use setup.py and not requirements.txt to manage test requirements. --- requirements.txt | 2 -- setup.py | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 088a2579f..7c718e7af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,2 @@ six>=1.8.0 w3lib>=1.15.0 -redis>=2.10.5 -hiredis>=0.2 diff --git a/setup.py b/setup.py index 5f305a258..22392f4b8 100644 --- a/setup.py +++ b/setup.py @@ -84,6 +84,8 @@ "mock", "boto>=2.42.0", "colorlog>=2.4.0", - "python-json-logger>=0.1.5" + "python-json-logger>=0.1.5", + "redis>=2.10.5", + "hiredis>=0.2" ] ) From 5f9fc23199cf1fa11cf43a05a83f21c20e71943e Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Wed, 22 Feb 2017 10:02:38 +0100 Subject: [PATCH 019/273] Cleanup log naming Use redis_backend for logging to avoid conflicts with the main redis module.
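Python's logger hierarchy is name-based, so a logger called "redis.queue" is a child of any "redis" logger and picks up configuration aimed at the redis client package (and vice versa). With the rename, the backend's verbosity can be tuned on its own; a small configuration sketch:

    import logging

    # The backend loggers now live under their own "redis_backend" hierarchy,
    # independent of anything configured for the redis client library.
    logging.getLogger('redis_backend').setLevel(logging.INFO)
    logging.getLogger('redis_backend.states').setLevel(logging.DEBUG)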
--- frontera/contrib/backends/redis_backend/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index d403e3e09..86935af59 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -36,7 +36,7 @@ def __init__(self, pool, partitions, delete_all_keys=False): self._pool = pool self._partitions = [i for i in range(0, partitions)] self._partitioner = Crc32NamePartitioner(self._partitions) - self._logger = logging.getLogger("redis.queue") + self._logger = logging.getLogger("redis_backend.queue") if delete_all_keys: connection = StrictRedis(connection_pool=self._pool) @@ -163,7 +163,7 @@ def __init__(self, pool, cache_size_limit): self._pool = pool self._cache = {} self._cache_size_limit = cache_size_limit - self._logger = logging.getLogger("redis.states") + self._logger = logging.getLogger("redis_backend.states") def update_cache(self, objs): objs = objs if isinstance(objs, Iterable) else [objs] @@ -220,7 +220,7 @@ def frontier_stop(self): class RedisMetadata(Metadata): def __init__(self, pool, delete_all_keys): self._pool = pool - self._logger = logging.getLogger("redis.metadata") + self._logger = logging.getLogger("redis_backend.metadata") if delete_all_keys: connection = StrictRedis(connection_pool=self._pool) connection.flushdb() @@ -294,7 +294,7 @@ class RedisBackend(DistributedBackend): def __init__(self, manager): self.manager = manager - self._logger = logging.getLogger("redis.backend") + self._logger = logging.getLogger("redis_backend.backend") settings = manager.settings port = settings.get('REDIS_PORT') host = settings.get('REDIS_HOST') From 0d8fcffcfddca52ef8afe15b30d33539e6416743 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Wed, 22 Feb 2017 10:07:09 +0100 Subject: [PATCH 020/273] Refactor code loading in redis backend Load codecs based on settings and not in a hardcoded way. 
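The codec path is resolved with load_object(), which imports ".Encoder" and ".Decoder" from whatever module the setting names. A sketch of switching to the bundled JSON codec through the setting introduced here (it is renamed to REDIS_BACKEND_CODEC later in this series):

    from frontera.settings import Settings

    # Point the Redis queue at the JSON codec instead of the msgpack default;
    # load_object() then resolves '<path>.Encoder' and '<path>.Decoder'.
    settings = Settings(module='frontera.settings.default_settings')
    settings.set('BACKEND_CODEC',
                 'frontera.contrib.backends.remote.codecs.json')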
--- .../backends/redis_backend/__init__.py | 20 +++++++++---------- frontera/settings/default_settings.py | 1 + .../backends/redis_backend/test_redis.py | 8 ++++++-- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 86935af59..3eebe51fe 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -4,10 +4,8 @@ from frontera.utils.url import parse_domain_from_url_fast from frontera import DistributedBackend from frontera.core.components import Metadata, Queue, States -from frontera.core.models import Request from frontera.contrib.backends.partitioners import Crc32NamePartitioner -from frontera.utils.misc import get_crc32 -from frontera.contrib.backends.remote.codecs.msgpack import Decoder, Encoder +from frontera.utils.misc import get_crc32, load_object import logging from msgpack import packb, unpackb from redis import ConnectionPool, StrictRedis @@ -32,7 +30,13 @@ class RedisQueue(Queue): MIN_SCORE = 0.0 SCORE_STEP = 0.01 - def __init__(self, pool, partitions, delete_all_keys=False): + def __init__(self, manager, pool, partitions, delete_all_keys=False): + settings = manager.settings + codec_path = settings.get('BACKEND_CODEC') + encoder_cls = load_object(codec_path + ".Encoder") + decoder_cls = load_object(codec_path + ".Decoder") + self._encoder = encoder_cls(manager.request_model) + self._decoder = decoder_cls(manager.request_model, manager.response_model) self._pool = pool self._partitions = [i for i in range(0, partitions)] self._partitioner = Crc32NamePartitioner(self._partitions) @@ -42,12 +46,6 @@ def __init__(self, pool, partitions, delete_all_keys=False): connection = StrictRedis(connection_pool=self._pool) connection.flushdb() - class DumbResponse: - pass - - self._decoder = Decoder(Request, DumbResponse) - self._encoder = Encoder(Request) - def get_next_requests(self, max_n_requests, partition_id, **kwargs): """ Fet new batch from priority queue. 
@@ -320,7 +318,7 @@ def db_worker(cls, manager): o = cls(manager) settings = manager.settings clear = settings.get('REDIS_DROP_ALL_TABLES') - o._queue = RedisQueue(o.pool, o.queue_partitions, delete_all_keys=clear) + o._queue = RedisQueue(manager, o.pool, o.queue_partitions, delete_all_keys=clear) o._metadata = RedisMetadata( o.pool, clear diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index b049e7bdc..7ff333396 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -4,6 +4,7 @@ AUTO_START = True BACKEND = 'frontera.contrib.backends.memory.FIFO' +BACKEND_CODEC = 'frontera.contrib.backends.remote.codecs.msgpack' BC_MIN_REQUESTS = 64 BC_MIN_HOSTS = 24 BC_MAX_REQUESTS_PER_HOST = 128 diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py index 8a0520705..331ce1676 100644 --- a/tests/contrib/backends/redis_backend/test_redis.py +++ b/tests/contrib/backends/redis_backend/test_redis.py @@ -3,6 +3,8 @@ from frontera.contrib.backends.redis_backend import FIELD_DOMAIN_FINGERPRINT, FIELD_ERROR, FIELD_STATE from frontera.contrib.backends.redis_backend import FIELD_STATUS_CODE, FIELD_URL from frontera.contrib.backends.redis_backend import RedisMetadata, RedisQueue, RedisState +from frontera.core.manager import FrontierManager +from frontera.settings import Settings from redis import ConnectionPool, StrictRedis from time import time from unittest import main, TestCase @@ -30,8 +32,10 @@ def get_pool(): class RedisQueueTest(TestCase): - def setup_subject(self, partitions): - return RedisQueue(get_pool(), partitions, True) + @staticmethod + def setup_subject(partitions): + settings = Settings(module='frontera.settings.default_settings') + return RedisQueue(FrontierManager.from_settings(settings), get_pool(), partitions, True) def test_scheduling_past_1part_5(self): subject = self.setup_subject(1) From 8b9eb11e7e9a255245aa026a232cd9d0af196a84 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Wed, 22 Feb 2017 10:46:40 +0100 Subject: [PATCH 021/273] Refactor get_next_requests in RedisBackend Iterate properly over partitions. Tests that this actually works. Add default REDIS_HOST and REDIS_PORT parameters. 
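The backend now iterates exactly the partitions the caller passes in, instead of scanning every partition id and skipping the unwanted ones. A usage sketch mirroring the new tests (it assumes a local Redis, as the test fixtures do):

    from frontera.contrib.backends.redis_backend import RedisBackend
    from frontera.core.manager import FrontierManager
    from frontera.settings import Settings

    # A DB worker backend asks only for the partitions it owns.
    settings = Settings(module='frontera.settings.default_settings')
    settings.set('SPIDER_FEED_PARTITIONS', 2)
    settings.set('REDIS_DROP_ALL_TABLES', True)
    backend = RedisBackend.db_worker(FrontierManager.from_settings(settings))
    requests = backend.get_next_requests(max_next_requests=10,
                                         partitions=['0', '1'])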
--- .../backends/redis_backend/__init__.py | 4 +-- frontera/settings/default_settings.py | 2 ++ .../backends/redis_backend/test_redis.py | 25 ++++++++++++++++++- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 3eebe51fe..e723b09e0 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -367,9 +367,7 @@ def get_next_requests(self, max_next_requests, **kwargs): next_pages = [] self._logger.debug("Querying queue table.") partitions = set(kwargs.pop('partitions', [])) - for partition_id in range(0, self.queue_partitions): - if partition_id not in partitions: - continue + for partition_id in partitions: results = self.queue.get_next_requests(max_next_requests, partition_id, min_hosts=self._min_hosts, max_requests_per_host=self._max_requests_per_host) diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 7ff333396..b264c5ffc 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -33,6 +33,8 @@ NEW_BATCH_DELAY = 30.0 OVERUSED_SLOT_FACTOR = 5.0 QUEUE_HOSTNAME_PARTITIONING = False +REDIS_HOST = 'localhost' +REDIS_PORT = 6379 REQUEST_MODEL = 'frontera.core.models.Request' RESPONSE_MODEL = 'frontera.core.models.Response' diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py index 331ce1676..a226b8ac8 100644 --- a/tests/contrib/backends/redis_backend/test_redis.py +++ b/tests/contrib/backends/redis_backend/test_redis.py @@ -2,7 +2,7 @@ from __future__ import absolute_import from frontera.contrib.backends.redis_backend import FIELD_DOMAIN_FINGERPRINT, FIELD_ERROR, FIELD_STATE from frontera.contrib.backends.redis_backend import FIELD_STATUS_CODE, FIELD_URL -from frontera.contrib.backends.redis_backend import RedisMetadata, RedisQueue, RedisState +from frontera.contrib.backends.redis_backend import RedisBackend, RedisMetadata, RedisQueue, RedisState from frontera.core.manager import FrontierManager from frontera.settings import Settings from redis import ConnectionPool, StrictRedis @@ -374,6 +374,29 @@ def test_links_extracted(self): self.assertEqual(b'https://www.hellan.me/', connection.hmget("l3", FIELD_URL)[0]) self.assertEqual(b'd_l3', connection.hmget('l3', FIELD_DOMAIN_FINGERPRINT)[0]) +class RedisBackendTest(TestCase): + @staticmethod + def setup_subject(partitions): + settings = Settings(module='frontera.settings.default_settings') + settings.set('SPIDER_FEED_PARTITIONS', partitions) + settings.set('REDIS_DROP_ALL_TABLES', True) + return RedisBackend.db_worker(FrontierManager.from_settings(settings)) + + def test_get_next_request(self): + subject = self.setup_subject(2) + requests = subject.get_next_requests(max_next_requests=10, partitions=['0', '1']) + self.assertEqual(0, len(requests)) + + def test_get_next_request_has_requests(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.queue.schedule(batch) + requests = subject.get_next_requests(max_next_requests=10, partitions=['0', '1']) + self.assertEqual(3, len(requests)) if __name__ == 
'__main__': main() From e5fc4032ed31e1a017bc887149e20d0fdc2a336c Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Wed, 22 Feb 2017 13:08:46 +0100 Subject: [PATCH 022/273] Bugfix setup.py Add redis dependencies to extras_require --- setup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.py b/setup.py index 22392f4b8..1885c74a7 100644 --- a/setup.py +++ b/setup.py @@ -71,6 +71,10 @@ ], 'distributed': [ 'Twisted' + ], + 'redis': [ + 'redis>=2.10.5', + 'hiredis>=0.2' ] }, tests_require=[ From 9f175c48fa9de89ed0d4822e5f0138d9258ed439 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Wed, 22 Feb 2017 16:31:54 +0100 Subject: [PATCH 023/273] Bugfix requirements Add redis requirements to tests requirements. --- requirements/tests.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/tests.txt b/requirements/tests.txt index 455cd0c35..a18d14709 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -13,3 +13,5 @@ happybase>=1.0.0 mock boto>=2.42.0 -r logging.txt +redis>=2.10.5 +hiredis>=0.2x From 186335e63e4ddf5b04bd604d57eb96dae6551271 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Wed, 22 Feb 2017 21:05:26 +0100 Subject: [PATCH 024/273] Bugfix spurious letter Remove random x in the hiredis dependency spec. --- requirements/tests.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/tests.txt b/requirements/tests.txt index a18d14709..d73107e03 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -14,4 +14,4 @@ mock boto>=2.42.0 -r logging.txt redis>=2.10.5 -hiredis>=0.2x +hiredis>=0.2 From 2ffdef8d34da7e0c5eb4db92f4fd5cdb13b4bdc4 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Thu, 23 Feb 2017 09:22:37 +0100 Subject: [PATCH 025/273] Bugfix available blocked Correct the get_next_requests in RedisQueue to prevent low priority requests from being blocked by high priority requests scheduled for the future. --- .../backends/redis_backend/__init__.py | 41 +++++++++++-------- .../backends/redis_backend/test_redis.py | 19 +++++++++ 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index e723b09e0..00fdd79d1 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -48,7 +48,7 @@ def __init__(self, manager, pool, partitions, delete_all_keys=False): def get_next_requests(self, max_n_requests, partition_id, **kwargs): """ - Fet new batch from priority queue. + Fetch new batch from priority queue. :param max_n_requests: maximum number of requests :param partition_id: partition id to get batch from :return: list of :class:`Request ` objects. 
@@ -60,23 +60,28 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): now_ts = int(time()) max_host_items = 0 to_remove = [] - for data in connection.zrevrange(partition_id, start=0, end=max_n_requests): - item = unpackb(data, use_list=False) - timestamp, fprint, host_crc32, _, score = item - if timestamp > now_ts: - continue - if host_crc32 not in queue: - queue[host_crc32] = [] - if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host: - continue - queue[host_crc32].append(item) - if len(queue[host_crc32]) > max_host_items: - max_host_items = len(queue[host_crc32]) - count += 1 - to_remove.append(data) - - if count >= max_n_requests: - break + start = 0 + last_start = -1 + while count < max_n_requests and last_start < start: + last_start = start + for data in connection.zrevrange(partition_id, start=start, end=max_n_requests + start): + start += 1 + item = unpackb(data, use_list=False) + timestamp, fprint, host_crc32, _, score = item + if timestamp > now_ts: + continue + if host_crc32 not in queue: + queue[host_crc32] = [] + if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host: + continue + queue[host_crc32].append(item) + if len(queue[host_crc32]) > max_host_items: + max_host_items = len(queue[host_crc32]) + count += 1 + to_remove.append(data) + + if count >= max_n_requests: + break self._logger.debug("Finished: hosts {}, requests {}".format(len(queue.keys()), count)) diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py index a226b8ac8..34fee6421 100644 --- a/tests/contrib/backends/redis_backend/test_redis.py +++ b/tests/contrib/backends/redis_backend/test_redis.py @@ -213,6 +213,25 @@ def test_scheduling_mix(self): self.assertTrue('https://www.khellan.com/' in urls) self.assertEqual(2, subject.count()) + def test_scheduling_conflict_high_score_high_timestamp(self): + subject = self.setup_subject(1) + batch = [ + ("1", 1, Request("1", int(time()) + 86400, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True), + ("4", 0.7, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True), + ("5", 0.8, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True), + ("6", 0.9, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(6, subject.count()) + + requests = subject.get_next_requests(2, 0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.khellan.com/' in urls) + self.assertEqual(5, subject.count()) + class RedisStateTest(TestCase): def test_update_cache(self): From 2b0401fa141956697d94fb86d591fa1090c2082f Mon Sep 17 00:00:00 2001 From: xsren Date: Fri, 24 Feb 2017 15:28:57 +0800 Subject: [PATCH 026/273] Update run-modes.rst --- docs/source/topics/run-modes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/topics/run-modes.rst b/docs/source/topics/run-modes.rst index a323d8377..e07806897 100644 --- a/docs/source/topics/run-modes.rst +++ b/docs/source/topics/run-modes.rst @@ -31,7 +31,7 @@ spiders using :term:`message bus`. 1. 
Use :setting:`BACKEND` in spider processes set to :class:`MessageBusBackend ` -2. In DB worker :setting:`BACKEND` should point to :class:`Backend ` subclasse. +2. In DB worker :setting:`BACKEND` should point to :class:`Backend ` subclass. 3. Every spider process should have it's own :setting:`SPIDER_PARTITION_ID`, starting from 0 to :setting:`SPIDER_FEED_PARTITIONS`. 4. Both spiders and workers should have it's :setting:`MESSAGE_BUS` setting set to the message bus class of your choice, From b0ed767b776ec74381a130d29d2159b0649a6f67 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Wed, 8 Mar 2017 13:43:15 +0100 Subject: [PATCH 027/273] Rename redis_backend setting Rename BACKEND_CODEC into REDIS_BACKEND_CODEC to show it does not affect all backends. --- frontera/contrib/backends/redis_backend/__init__.py | 2 +- frontera/settings/default_settings.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 00fdd79d1..6805663cb 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -32,7 +32,7 @@ class RedisQueue(Queue): def __init__(self, manager, pool, partitions, delete_all_keys=False): settings = manager.settings - codec_path = settings.get('BACKEND_CODEC') + codec_path = settings.get('REDIS_BACKEND_CODEC') encoder_cls = load_object(codec_path + ".Encoder") decoder_cls = load_object(codec_path + ".Decoder") self._encoder = encoder_cls(manager.request_model) diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index b264c5ffc..3bf72a003 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -4,7 +4,6 @@ AUTO_START = True BACKEND = 'frontera.contrib.backends.memory.FIFO' -BACKEND_CODEC = 'frontera.contrib.backends.remote.codecs.msgpack' BC_MIN_REQUESTS = 64 BC_MIN_HOSTS = 24 BC_MAX_REQUESTS_PER_HOST = 128 @@ -33,6 +32,7 @@ NEW_BATCH_DELAY = 30.0 OVERUSED_SLOT_FACTOR = 5.0 QUEUE_HOSTNAME_PARTITIONING = False +REDIS_BACKEND_CODEC = 'frontera.contrib.backends.remote.codecs.msgpack' REDIS_HOST = 'localhost' REDIS_PORT = 6379 REQUEST_MODEL = 'frontera.core.models.Request' From 03d50f8a4ddd2a7d1e4d36ea95d38ab5280d6e7f Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 14 Mar 2017 16:55:46 +0100 Subject: [PATCH 028/273] including None in the objects which can pass safely --- frontera/contrib/backends/remote/codecs/msgpack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py index 6be589dae..247db1544 100644 --- a/frontera/contrib/backends/remote/codecs/msgpack.py +++ b/frontera/contrib/backends/remote/codecs/msgpack.py @@ -16,7 +16,7 @@ def _prepare_request_message(request): def serialize(obj): """Recursively walk object's hierarchy.""" - if isinstance(obj, (bool, six.integer_types, float, six.binary_type, six.text_type)): + if isinstance(obj, (bool, six.integer_types, float, six.binary_type, six.text_type)) or obj == None: return obj elif isinstance(obj, dict): obj = obj.copy() From cb4c352c6b9cec7dbce29e5f0657fb1321975126 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 14 Mar 2017 17:04:08 +0100 Subject: [PATCH 029/273] sending None if offset isn't available --- frontera/contrib/backends/remote/codecs/msgpack.py | 2 +- frontera/contrib/messagebus/zeromq/__init__.py | 2 +- 2
files changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py index 247db1544..37825cbbc 100644 --- a/frontera/contrib/backends/remote/codecs/msgpack.py +++ b/frontera/contrib/backends/remote/codecs/msgpack.py @@ -16,7 +16,7 @@ def _prepare_request_message(request): def serialize(obj): """Recursively walk object's hierarchy.""" - if isinstance(obj, (bool, six.integer_types, float, six.binary_type, six.text_type)) or obj == None: + if isinstance(obj, (bool, six.integer_types, float, six.binary_type, six.text_type)) or obj is None: return obj elif isinstance(obj, dict): obj = obj.copy() diff --git a/frontera/contrib/messagebus/zeromq/__init__.py b/frontera/contrib/messagebus/zeromq/__init__.py index ab1a56155..3f99fc973 100644 --- a/frontera/contrib/messagebus/zeromq/__init__.py +++ b/frontera/contrib/messagebus/zeromq/__init__.py @@ -98,7 +98,7 @@ def flush(self): pass def get_offset(self, partition_id): - return self.counters[partition_id] + return self.counters.get(partition_id, None) class SpiderLogProducer(Producer): From c25d11e8bfe0dbf3c004156461d5b8fb3d536bbb Mon Sep 17 00:00:00 2001 From: xsren Date: Tue, 4 Apr 2017 21:33:53 +0800 Subject: [PATCH 030/273] Update frontier-backends.rst fix some typo --- docs/source/topics/frontier-backends.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/topics/frontier-backends.rst b/docs/source/topics/frontier-backends.rst index 706da8377..041d63c90 100644 --- a/docs/source/topics/frontier-backends.rst +++ b/docs/source/topics/frontier-backends.rst @@ -161,7 +161,7 @@ To know the default activated :class:`Backend Basic algorithms ^^^^^^^^^^^^^^^^ -Some of the built-in :class:`Backend ` objects implement basic algorithms as +Some of the built-in :class:`Backend ` objects implement basic algorithms such as `FIFO`_/`LIFO`_ or `DFS`_/`BFS`_ for page visit ordering. Differences between them will be on storage engine used. For instance, From 87a43c2c92e1375991e849fa8125287a23f9d664 Mon Sep 17 00:00:00 2001 From: xsren Date: Wed, 5 Apr 2017 21:59:43 +0800 Subject: [PATCH 031/273] Update frontier-backends.rst --- docs/source/topics/frontier-backends.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/source/topics/frontier-backends.rst b/docs/source/topics/frontier-backends.rst index 041d63c90..add2d78c0 100644 --- a/docs/source/topics/frontier-backends.rst +++ b/docs/source/topics/frontier-backends.rst @@ -278,10 +278,7 @@ HBase backend Is more suitable for large scale web crawlers. Settings reference can be found here :ref:`hbase-settings`. Consider tunning a block cache to fit states within one block for average size website. To achieve this it's recommended to use -:attr:`hostname_local_fingerprint ` - -to achieve documents closeness within the same host. This function can be selected with :setting:`URL_FINGERPRINT_FUNCTION` -setting. +:attr:`hostname_local_fingerprint ` to achieve documents closeness within the same host. This function can be selected with :setting:`URL_FINGERPRINT_FUNCTION` setting. .. 
TODO: document details of block cache tuning, BC* settings and queue get operation concept, From fe56901175834fa598524fca0b47de608783d0e2 Mon Sep 17 00:00:00 2001 From: Anatolii Aniskovych Date: Sat, 15 Apr 2017 11:08:22 +0300 Subject: [PATCH 032/273] Fix error raising in msgpack and json decoders --- frontera/contrib/backends/remote/codecs/json.py | 2 +- frontera/contrib/backends/remote/codecs/msgpack.py | 2 +- tests/test_codecs.py | 14 +++++++++----- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/frontera/contrib/backends/remote/codecs/json.py b/frontera/contrib/backends/remote/codecs/json.py index ef4aa538d..135c44d83 100644 --- a/frontera/contrib/backends/remote/codecs/json.py +++ b/frontera/contrib/backends/remote/codecs/json.py @@ -190,7 +190,7 @@ def decode(self, message): return ('new_job_id', int(message['job_id'])) if message['type'] == 'offset': return ('offset', int(message['partition_id']), int(message['offset'])) - return TypeError('Unknown message type') + raise TypeError('Unknown message type') def decode_request(self, message): obj = _convert_from_saved_type(super(Decoder, self).decode(message)) diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py index 37825cbbc..c53f9bf02 100644 --- a/frontera/contrib/backends/remote/codecs/msgpack.py +++ b/frontera/contrib/backends/remote/codecs/msgpack.py @@ -108,7 +108,7 @@ def decode(self, buffer): return ('new_job_id', int(obj[1])) if obj[0] == b'of': return ('offset', int(obj[1]), int(obj[2])) - return TypeError('Unknown message type') + raise TypeError('Unknown message type') def decode_request(self, buffer): return self._request_from_object(unpackb(buffer, encoding='utf-8')) diff --git a/tests/test_codecs.py b/tests/test_codecs.py index 82136f14b..e4d59348c 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -41,12 +41,12 @@ def _compare_dicts(dict1, dict2): @pytest.mark.parametrize('send_body', [True, False]) @pytest.mark.parametrize( - ('encoder', 'decoder'), [ - (MsgPackEncoder, MsgPackDecoder), - (JsonEncoder, JsonDecoder) + ('encoder', 'decoder', 'invalid_value'), [ + (MsgPackEncoder, MsgPackDecoder, b'\x91\xc4\x04test'), + (JsonEncoder, JsonDecoder, b'["dict", [[["bytes", "type"], ["bytes", "test"]]]]') ] ) -def test_codec(encoder, decoder, send_body): +def test_codec(encoder, decoder, send_body, invalid_value): def check_request(req1, req2): assert req1.url == req2.url and _compare_dicts(req1.meta, req2.meta) == True and \ _compare_dicts(req1.headers, req2.headers) == True and req1.method == req2.method @@ -65,7 +65,8 @@ def check_request(req1, req2): enc.encode_update_score(req, 0.51, True), enc.encode_new_job_id(1), enc.encode_offset(0, 28796), - enc.encode_request(req) + enc.encode_request(req), + invalid_value, ] it = iter(msgs) @@ -119,6 +120,9 @@ def check_request(req1, req2): o = dec.decode_request(next(it)) check_request(o, req) + with pytest.raises(TypeError): + dec.decode(next(it)) + class TestEncodeDecodeJson(unittest.TestCase): """ From 51a50b2f14e3546f47a76020b9d43ce1bd8210bf Mon Sep 17 00:00:00 2001 From: isra17 Date: Thu, 20 Apr 2017 10:48:35 -0400 Subject: [PATCH 033/273] Commit database cleared content --- frontera/contrib/backends/sqlalchemy/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index b8e7b8aa1..810975d67 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ 
b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -35,6 +35,7 @@ def __init__(self, manager): session = self.session_cls() for name, table in DeclarativeBase.metadata.tables.items(): session.execute(table.delete()) + session.commit() session.close() self._metadata = Metadata(self.session_cls, self.models['MetadataModel'], settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) From 276f6e2ea9da4513a2eba5ed9d536c12f477a20c Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 2 May 2017 11:50:27 +0200 Subject: [PATCH 034/273] adding hostname to md5 part --- frontera/utils/fingerprint.py | 16 +++++++--------- tests/test_fingerprint.py | 12 ++++++------ 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/frontera/utils/fingerprint.py b/frontera/utils/fingerprint.py index 97bb55385..b491ebcb6 100644 --- a/frontera/utils/fingerprint.py +++ b/frontera/utils/fingerprint.py @@ -1,11 +1,10 @@ from __future__ import absolute_import import hashlib -from six.moves.urllib.parse import urlparse from struct import pack from binascii import hexlify from frontera.utils.misc import get_crc32 from frontera.utils.url import parse_url -from w3lib.util import to_native_str, to_bytes +from w3lib.util import to_bytes def sha1(key): @@ -27,12 +26,11 @@ def hostname_local_fingerprint(key): :return: str 20 bytes hex string """ result = parse_url(key) - if not result.hostname: - return sha1(key) - host_checksum = get_crc32(result.hostname) - doc_uri_combined = result.path+';'+result.params+result.query+result.fragment + hostname = result.hostname if result.hostname else '-' + host_checksum = get_crc32(hostname) + combined = hostname+result.path+';'+result.params+result.query+result.fragment - doc_uri_combined = to_bytes(doc_uri_combined, 'utf8', 'ignore') - doc_fprint = hashlib.md5(doc_uri_combined).digest() + combined = to_bytes(combined, 'utf8', 'ignore') + doc_fprint = hashlib.md5(combined).digest() fprint = hexlify(pack(">i16s", host_checksum, doc_fprint)) - return fprint + return fprint \ No newline at end of file diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py index f4b4ca33b..0ea37083f 100644 --- a/tests/test_fingerprint.py +++ b/tests/test_fingerprint.py @@ -32,11 +32,11 @@ def test_md5_unicode(self): assert md5(url3) == b'5abf5c9aa02d870756032bdec0bd6522' def test_local_hostname_fingerprint_bytes(self): - assert hostname_local_fingerprint(to_bytes(url1)) == b'1be68ff556fd0bbe5802d1a100850da29f7f15b1' - assert hostname_local_fingerprint(to_bytes(url2)) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d' - assert hostname_local_fingerprint(to_bytes(url3)) == b'2ed642bbdf514b8520ab28f5da589ab28eda10a6' + assert hostname_local_fingerprint(to_bytes(url1)) == b'1be68ff5587d241e22865288133b37d63ab49e13' + assert hostname_local_fingerprint(to_bytes(url2)) == b'97ddb3f898d2460d60d3f4d6cb7dbc5d0b8025f8' + assert hostname_local_fingerprint(to_bytes(url3)) == b'2ed642bb1e215e68ef283a1939252734e84c3c76' def test_local_hostname_frongerprint_unicode(self): - assert hostname_local_fingerprint(url1) == b'1be68ff556fd0bbe5802d1a100850da29f7f15b1' - assert hostname_local_fingerprint(url2) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d' - assert hostname_local_fingerprint(url3) == b'2ed642bbdf514b8520ab28f5da589ab28eda10a6' + assert hostname_local_fingerprint(url1) == b'1be68ff5587d241e22865288133b37d63ab49e13' + assert hostname_local_fingerprint(url2) == b'97ddb3f898d2460d60d3f4d6cb7dbc5d0b8025f8' + assert hostname_local_fingerprint(url3) == b'2ed642bb1e215e68ef283a1939252734e84c3c76' \ No newline 
at end of file From cf9426558bb8b946708cb2de57fc3d48f580b8e5 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 4 May 2017 09:58:42 +0200 Subject: [PATCH 035/273] using rk prefix instead of filter --- frontera/contrib/backends/hbase.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 8f60cb6d3..1b7e29c91 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -197,7 +197,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): count = 0 prefix = '%d_' % partition_id now_ts = int(time()) - filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) + while tries < self.GET_RETRIES: tries += 1 limit *= 5.5 if tries > 1 else 1.0 @@ -206,7 +206,9 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): meta_map.clear() queue.clear() count = 0 - for rk, data in table.scan(limit=int(limit), batch_size=256, filter=filter): + # filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) + # TODO: figure out how to use filter here, Thrift filter above causes full scan + for rk, data in table.scan(limit=int(limit), batch_size=256, row_prefix=prefix): for cq, buf in six.iteritems(data): if cq == b'f:t': continue From f5684cc4b964e347490df997f74b9cb215223dc5 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Wed, 24 May 2017 14:08:45 +0200 Subject: [PATCH 036/273] Feature retry with backoff Retry writing to Redis on ResponseError. This typically happens if there is a transient or fixable issue such as out of memory (aka full Redis). An error message is logged and then the operation is retried later. 
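The retries are driven by a generator of pauses, so every write site shares one backoff schedule. A standalone sketch of the pattern in Python 3 spelling (the patch itself uses the Python 2 iterator protocol and inlines this loop into each method; write_with_retry is an illustrative name, not a helper from the patch):

    import logging
    from time import sleep

    from redis.exceptions import ResponseError

    logger = logging.getLogger('redis_backend')

    def _get_retry_timeouts():
        # A few increasing pauses, then steady at 30 seconds.
        for timeout in [5, 10, 20, 30]:
            yield timeout
        while True:
            yield 30

    def write_with_retry(connection, key, mapping):
        # ResponseError usually means Redis is out of memory; retrying gives
        # the rest of the crawler a chance to free up space.
        timeouts = _get_retry_timeouts()
        while True:
            try:
                connection.hmset(key, mapping)
                return
            except ResponseError as e:
                logger.warning(str(e))
                sleep(next(timeouts))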
--- .../backends/redis_backend/__init__.py | 162 ++++++++++++------ 1 file changed, 106 insertions(+), 56 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 6805663cb..9ebef68a9 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -9,7 +9,8 @@ import logging from msgpack import packb, unpackb from redis import ConnectionPool, StrictRedis -from time import time +from redis.exceptions import ResponseError +from time import sleep, time FIELD_CRAWL_AT = b'crawl_at' FIELD_CREATED_AT = b'created_at' @@ -25,6 +26,13 @@ FIELD_URL = b'url' +# Timeout generator with backoff until 30 seconds +def _get_retry_timeouts(): + for timeout in [5, 10, 20, 30, 30]: yield timeout + while True: + yield 30 + + class RedisQueue(Queue): MAX_SCORE = 1.0 MIN_SCORE = 0.0 @@ -141,11 +149,18 @@ def _schedule(self, batch, timestamp): item = (timestamp, fingerprint, host_crc32, self._encoder.encode_request(request), score) interval_start = self.get_interval_start(score) data.setdefault(partition_id, []).extend([int(interval_start * 100), packb(item)]) - connection = StrictRedis(connection_pool=self._pool) - pipe = connection.pipeline() - for key, items in data.items(): - connection.zadd(key, *items) - pipe.execute() + timeout = _get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + pipe = connection.pipeline() + for key, items in data.items(): + connection.zadd(key, *items) + pipe.execute() + break + except ResponseError as e: + self._logger.warning(e.message) + sleep(timeout.next()) def count(self): connection = StrictRedis(connection_pool=self._pool) @@ -188,11 +203,18 @@ def get(obj): def flush(self, force_clear): if len(self._cache) > self._cache_size_limit: force_clear = True - connection = StrictRedis(connection_pool=self._pool) - pipe = connection.pipeline() - for fprint, state in self._cache.items(): - pipe.hmset(fprint, {FIELD_STATE: state}) - pipe.execute() + timeout = _get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + pipe = connection.pipeline() + for fprint, state in self._cache.items(): + pipe.hmset(fprint, {FIELD_STATE: state}) + pipe.execute() + break + except ResponseError as e: + self._logger.warning(e.message) + sleep(timeout.next()) if force_clear: self._logger.debug("Cache has %d requests, clearing" % len(self._cache)) self._cache.clear() @@ -233,57 +255,85 @@ def timestamp(cls): return str(datetime.utcnow().replace(microsecond=0)) def add_seeds(self, seeds): - connection = StrictRedis(connection_pool=self._pool) - pipe = connection.pipeline() - for seed in seeds: - pipe.hmset( - seed.meta[FIELD_FINGERPRINT], - { - FIELD_URL: seed.url, - FIELD_DEPTH: 0, - FIELD_CREATED_AT: self.timestamp(), - FIELD_DOMAIN_FINGERPRINT: seed.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] - } - ) - pipe.execute() + timeout = _get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + pipe = connection.pipeline() + for seed in seeds: + pipe.hmset( + seed.meta[FIELD_FINGERPRINT], + { + FIELD_URL: seed.url, + FIELD_DEPTH: 0, + FIELD_CREATED_AT: self.timestamp(), + FIELD_DOMAIN_FINGERPRINT: seed.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] + } + ) + pipe.execute() + break + except ResponseError as e: + self._logger.warning(e.message) + sleep(timeout.next()) def request_error(self, page, error): - connection = StrictRedis(connection_pool=self._pool) - 
connection.hmset( - page.meta[FIELD_FINGERPRINT], - { - FIELD_URL: page.url, - FIELD_CREATED_AT: self.timestamp(), - FIELD_ERROR: error, - FIELD_DOMAIN_FINGERPRINT: page.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] - } - ) + timeout = _get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + connection.hmset( + page.meta[FIELD_FINGERPRINT], + { + FIELD_URL: page.url, + FIELD_CREATED_AT: self.timestamp(), + FIELD_ERROR: error, + FIELD_DOMAIN_FINGERPRINT: page.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] + } + ) + break + except ResponseError as e: + self._logger.warning(e.message) + sleep(timeout.next()) def page_crawled(self, response): - connection = StrictRedis(connection_pool=self._pool) - connection.hmset( - response.meta[FIELD_FINGERPRINT], - { - FIELD_STATUS_CODE: response.status_code - } - ) + timeout = _get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + connection.hmset( + response.meta[FIELD_FINGERPRINT], + { + FIELD_STATUS_CODE: response.status_code + } + ) + break + except ResponseError as e: + self._logger.warning(e.message) + sleep(timeout.next()) def links_extracted(self, _, links): - links_processed = set() - connection = StrictRedis(connection_pool=self._pool) - for link in links: - link_fingerprint = link.meta[FIELD_FINGERPRINT] - if link_fingerprint in links_processed: - continue - connection.hmset( - link_fingerprint, - { - FIELD_URL: link.url, - FIELD_CREATED_AT: self.timestamp(), - FIELD_DOMAIN_FINGERPRINT: link.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] - } - ) - links_processed.add(link_fingerprint) + timeout = _get_retry_timeouts() + while True: + try: + links_processed = set() + connection = StrictRedis(connection_pool=self._pool) + for link in links: + link_fingerprint = link.meta[FIELD_FINGERPRINT] + if link_fingerprint in links_processed: + continue + connection.hmset( + link_fingerprint, + { + FIELD_URL: link.url, + FIELD_CREATED_AT: self.timestamp(), + FIELD_DOMAIN_FINGERPRINT: link.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] + } + ) + links_processed.add(link_fingerprint) + break + except ResponseError as e: + self._logger.warning(e.message) + sleep(timeout.next()) def frontier_start(self): pass From 713dd39f22c9b1608f82e7642bd2f03a77b5f54c Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Mon, 29 May 2017 09:26:37 +0200 Subject: [PATCH 037/273] Handle Redis Connection errors Add handling of Connection errors from Redis. On connection error, the redis backend will retry 3 times with increasing timeout. On other write errors, the crawler will crash and die. The usual reason for this error is that Redis has used all available memory.
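Taken together, the policy is: retry connection failures a bounded number of times with growing pauses, then give up; fail fast on other write errors. A condensed sketch of the control flow repeated at each write site (guarded is a hypothetical helper, written in Python 3 spelling; the patch inlines this loop into every method):

    import logging
    from time import sleep

    from redis.exceptions import ConnectionError, ResponseError

    logger = logging.getLogger('redis_backend')

    def _get_retry_timeouts():
        # Three pauses, then None to signal "give up".
        for timeout in [0, 10, 30]:
            yield timeout
        yield None

    def guarded(operation, description):
        timeouts = _get_retry_timeouts()
        while True:
            try:
                return operation()
            except ConnectionError:
                logger.exception("Connection to Redis failed when attempting to %s" % description)
                pause = next(timeouts)
                if pause is None:
                    raise
                sleep(pause)
            except ResponseError:
                logger.exception("Writing to Redis failed when attempting to %s" % description)
                raise

For example, guarded(lambda: connection.hmset(fingerprint, fields), "write request error") would wrap a single write with the same log wording the patch uses.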
--- .../backends/redis_backend/__init__.py | 69 ++++++++++++++----- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 9ebef68a9..8f7fc010b 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -9,7 +9,7 @@ import logging from msgpack import packb, unpackb from redis import ConnectionPool, StrictRedis -from redis.exceptions import ResponseError +from redis.exceptions import ConnectionError, ResponseError from time import sleep, time FIELD_CRAWL_AT = b'crawl_at' @@ -26,11 +26,10 @@ FIELD_URL = b'url' -# Timeout generator with backoff until 30 seconds +# Timeout generator with backoff until 60 seconds def _get_retry_timeouts(): - for timeout in [5, 10, 20, 30, 30]: yield timeout - while True: - yield 30 + for timeout in [0, 10, 30]: yield timeout + yield None class RedisQueue(Queue): @@ -158,9 +157,15 @@ def _schedule(self, batch, timestamp): connection.zadd(key, *items) pipe.execute() break + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to schedule items") + pause = timeout.next() + if pause == None: + raise + sleep(pause) except ResponseError as e: - self._logger.warning(e.message) - sleep(timeout.next()) + self._logger.exception("Writing to Redis failed when attempting to schedule items") + raise def count(self): connection = StrictRedis(connection_pool=self._pool) @@ -212,9 +217,15 @@ def flush(self, force_clear): pipe.hmset(fprint, {FIELD_STATE: state}) pipe.execute() break + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to flush cache") + pause = timeout.next() + if pause == None: + raise + sleep(pause) except ResponseError as e: - self._logger.warning(e.message) - sleep(timeout.next()) + self._logger.exception("Writing to Redis failed when attempting to flush cache") + raise if force_clear: self._logger.debug("Cache has %d requests, clearing" % len(self._cache)) self._cache.clear() @@ -272,9 +283,15 @@ def add_seeds(self, seeds): ) pipe.execute() break + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to add seeds") + pause = timeout.next() + if pause == None: + raise + sleep(pause) except ResponseError as e: - self._logger.warning(e.message) - sleep(timeout.next()) + self._logger.exception("Writing to Redis failed when attempting to add seeds") + raise def request_error(self, page, error): timeout = _get_retry_timeouts() @@ -291,9 +308,15 @@ def request_error(self, page, error): } ) break + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to write request error") + pause = timeout.next() + if pause == None: + raise + sleep(pause) except ResponseError as e: - self._logger.warning(e.message) - sleep(timeout.next()) + self._logger.exception("Writing to Redis failed when attempting to write request error") + raise def page_crawled(self, response): timeout = _get_retry_timeouts() @@ -307,9 +330,15 @@ def page_crawled(self, response): } ) break + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to write page crawled status") + pause = timeout.next() + if pause == None: + raise + sleep(pause) except ResponseError as e: - self._logger.warning(e.message) - sleep(timeout.next()) + self._logger.exception("Writing to Redis failed when attempting to write 
page crawled status") + raise def links_extracted(self, _, links): timeout = _get_retry_timeouts() @@ -331,9 +360,15 @@ def links_extracted(self, _, links): ) links_processed.add(link_fingerprint) break + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to write links extracted") + pause = timeout.next() + if pause == None: + raise + sleep(pause) except ResponseError as e: - self._logger.warning(e.message) - sleep(timeout.next()) + self._logger.exception("Writing to Redis failed when attempting to write links extracted") + raise def frontier_start(self): pass From 3a5a79a538c81d22857564fb7a9203b12d069f02 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Mon, 29 May 2017 09:48:29 +0200 Subject: [PATCH 038/273] Comment added about Redis errors Add description of Connection error handling. Add description of response error handling. --- frontera/contrib/backends/redis_backend/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 8f7fc010b..765fab2a5 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -26,6 +26,18 @@ FIELD_URL = b'url' +""" +Error handling: +* On Connection error: +** Retry three times with increasing timeout. +** Fail and report error if the third retry fails. +* On Response error: +** Report and continue. +** Response error is usually caused by Redis using all available memory. Ideally, Redis should have enough memory + for this not to happen. Still, if Redis is full, the rest of the crawler may continue and free up some space in + Redis after a while. +""" + # Timeout generator with backoff until 60 seconds def _get_retry_timeouts(): for timeout in [0, 10, 30]: yield timeout From 8917df472ff3372d330889cbf6abd876524e1d3a Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Mon, 29 May 2017 09:59:16 +0200 Subject: [PATCH 039/273] Handle Redis error on reading Add handling of Redis errors when reading from the database. Log and ignore Response errors. This should not stop any process. --- .../backends/redis_backend/__init__.py | 139 ++++++++++++------ 1 file changed, 97 insertions(+), 42 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 765fab2a5..f7abb54ed 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -73,7 +73,6 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): :return: list of :class:`Request ` objects.
""" max_requests_per_host = kwargs.pop('max_requests_per_host') - connection = StrictRedis(connection_pool=self._pool) queue = {} count = 0 now_ts = int(time()) @@ -83,24 +82,37 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): last_start = -1 while count < max_n_requests and last_start < start: last_start = start - for data in connection.zrevrange(partition_id, start=start, end=max_n_requests + start): - start += 1 - item = unpackb(data, use_list=False) - timestamp, fprint, host_crc32, _, score = item - if timestamp > now_ts: - continue - if host_crc32 not in queue: - queue[host_crc32] = [] - if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host: - continue - queue[host_crc32].append(item) - if len(queue[host_crc32]) > max_host_items: - max_host_items = len(queue[host_crc32]) - count += 1 - to_remove.append(data) - - if count >= max_n_requests: + timeout = _get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + for data in connection.zrevrange(partition_id, start=start, end=max_n_requests + start): + start += 1 + item = unpackb(data, use_list=False) + timestamp, fprint, host_crc32, _, score = item + if timestamp > now_ts: + continue + if host_crc32 not in queue: + queue[host_crc32] = [] + if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host: + continue + queue[host_crc32].append(item) + if len(queue[host_crc32]) > max_host_items: + max_host_items = len(queue[host_crc32]) + count += 1 + to_remove.append(data) + + if count >= max_n_requests: + break break + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to get more requests") + pause = timeout.next() + if pause == None: + raise + sleep(pause) + except ResponseError as e: + self._logger.exception("Writing to Redis failed when attempting to get more requests") self._logger.debug("Finished: hosts {}, requests {}".format(len(queue.keys()), count)) @@ -116,7 +128,20 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): request.meta[FIELD_SCORE] = score results.append(request) if len(to_remove) > 0: - connection.zrem(partition_id, *to_remove) + timeout = _get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + connection.zrem(partition_id, *to_remove) + break + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to remove scheduled items") + pause = timeout.next() + if pause == None: + raise + sleep(pause) + except ResponseError as e: + self._logger.exception("Writing to Redis failed when attempting to remove scheduled items") return results def schedule(self, batch): @@ -177,13 +202,24 @@ def _schedule(self, batch, timestamp): sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to schedule items") - raise def count(self): - connection = StrictRedis(connection_pool=self._pool) - count = 0 - for partition_id in self._partitions: - count += connection.zcard(partition_id) + timeout = _get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + count = 0 + for partition_id in self._partitions: + count += connection.zcard(partition_id) + break + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to count items") + pause = timeout.next() + if pause == None: + raise + sleep(pause) + except ResponseError as e: + 
self._logger.exception("Writing to Redis failed when attempting to count items") return count def frontier_start(self): @@ -237,7 +273,6 @@ def flush(self, force_clear): sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to flush cache") - raise if force_clear: self._logger.debug("Cache has %d requests, clearing" % len(self._cache)) self._cache.clear() @@ -246,17 +281,29 @@ def fetch(self, fingerprints): to_fetch = [f for f in fingerprints if f not in self._cache] self._logger.debug("cache size %s" % len(self._cache)) self._logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints))) - connection = StrictRedis(connection_pool=self._pool) - pipe = connection.pipeline() - for key in to_fetch: - pipe.hgetall(key) - responses = pipe.execute() - for index, key in enumerate(to_fetch): - response = responses[index] - if len(response) > 0 and FIELD_STATE in response: - self._cache[key] = response[FIELD_STATE] - else: - self._cache[key] = self.NOT_CRAWLED + timeout = _get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + pipe = connection.pipeline() + for key in to_fetch: + pipe.hgetall(key) + responses = pipe.execute() + for index, key in enumerate(to_fetch): + response = responses[index] + if len(response) > 0 and FIELD_STATE in response: + self._cache[key] = response[FIELD_STATE] + else: + self._cache[key] = self.NOT_CRAWLED + break + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to fetch fingerprints") + pause = timeout.next() + if pause == None: + raise + sleep(pause) + except ResponseError as e: + self._logger.exception("Writing to Redis failed when attempting to fetch fingerprints") def frontier_start(self): pass @@ -270,8 +317,20 @@ def __init__(self, pool, delete_all_keys): self._pool = pool self._logger = logging.getLogger("redis_backend.metadata") if delete_all_keys: - connection = StrictRedis(connection_pool=self._pool) - connection.flushdb() + timeout = _get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + connection.flushdb() + break + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to flush database") + pause = timeout.next() + if pause == None: + raise + sleep(pause) + except ResponseError as e: + self._logger.exception("Writing to Redis failed when attempting to flush database") @classmethod def timestamp(cls): @@ -303,7 +362,6 @@ def add_seeds(self, seeds): sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to add seeds") - raise def request_error(self, page, error): timeout = _get_retry_timeouts() @@ -328,7 +386,6 @@ def request_error(self, page, error): sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to write request error") - raise def page_crawled(self, response): timeout = _get_retry_timeouts() @@ -350,7 +407,6 @@ def page_crawled(self, response): sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to write page crawled status") - raise def links_extracted(self, _, links): timeout = _get_retry_timeouts() @@ -380,7 +436,6 @@ def links_extracted(self, _, links): sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to write links extracted") - raise def frontier_start(self): pass From 
06896a1c7d6c128201d486091f4f606d01fe4e86 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Mon, 29 May 2017 10:41:41 +0200 Subject: [PATCH 040/273] Refactor no fail on Redis error Change behaviour on Connection error to skip operation and continue. Bugfix handling of Response error. Now correctly continues instead of getting stuck. --- .../backends/redis_backend/__init__.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index f7abb54ed..06f024337 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -30,7 +30,7 @@ Error handling: * On Connection error: ** Retry three times with increasing timout. -** Fail and report error if the third retry fails. +** Skip the operation if the third retry fails. * On Response error: ** Report and continue. ** Reponse error is usually caused by Redis using all available memory. Ideally, Redis should have enough memory @@ -109,10 +109,11 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): self._logger.exception("Connection to Redis failed when attempting to get more requests") pause = timeout.next() if pause == None: - raise + break sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to get more requests") + break self._logger.debug("Finished: hosts {}, requests {}".format(len(queue.keys()), count)) @@ -138,10 +139,11 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): self._logger.exception("Connection to Redis failed when attempting to remove scheduled items") pause = timeout.next() if pause == None: - raise + break sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to remove scheduled items") + break return results def schedule(self, batch): @@ -198,17 +200,18 @@ def _schedule(self, batch, timestamp): self._logger.exception("Connection to Redis failed when attempting to schedule items") pause = timeout.next() if pause == None: - raise + break sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to schedule items") + break def count(self): timeout = _get_retry_timeouts() while True: try: - connection = StrictRedis(connection_pool=self._pool) count = 0 + connection = StrictRedis(connection_pool=self._pool) for partition_id in self._partitions: count += connection.zcard(partition_id) break @@ -216,10 +219,11 @@ def count(self): self._logger.exception("Connection to Redis failed when attempting to count items") pause = timeout.next() if pause == None: - raise + break sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to count items") + break return count def frontier_start(self): @@ -269,10 +273,11 @@ def flush(self, force_clear): self._logger.exception("Connection to Redis failed when attempting to flush cache") pause = timeout.next() if pause == None: - raise + break sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to flush cache") + break if force_clear: self._logger.debug("Cache has %d requests, clearing" % len(self._cache)) self._cache.clear() @@ -300,10 +305,11 @@ def fetch(self, fingerprints): self._logger.exception("Connection to Redis failed when attempting to fetch fingerprints") pause = timeout.next() if pause == None: - raise + 
break sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to fetch fingerprints") + break def frontier_start(self): pass @@ -327,10 +333,11 @@ def __init__(self, pool, delete_all_keys): self._logger.exception("Connection to Redis failed when attempting to flush database") pause = timeout.next() if pause == None: - raise + break sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to flush database") + break @classmethod def timestamp(cls): @@ -358,10 +365,11 @@ def add_seeds(self, seeds): self._logger.exception("Connection to Redis failed when attempting to add seeds") pause = timeout.next() if pause == None: - raise + break sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to add seeds") + break def request_error(self, page, error): timeout = _get_retry_timeouts() @@ -382,10 +390,11 @@ def request_error(self, page, error): self._logger.exception("Connection to Redis failed when attempting to write request error") pause = timeout.next() if pause == None: - raise + break sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to write request error") + break def page_crawled(self, response): timeout = _get_retry_timeouts() @@ -403,10 +412,11 @@ def page_crawled(self, response): self._logger.exception("Connection to Redis failed when attempting to write page crawled status") pause = timeout.next() if pause == None: - raise + break sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to write page crawled status") + break def links_extracted(self, _, links): timeout = _get_retry_timeouts() @@ -432,10 +442,11 @@ def links_extracted(self, _, links): self._logger.exception("Connection to Redis failed when attempting to write links extracted") pause = timeout.next() if pause == None: - raise + break sleep(pause) except ResponseError as e: self._logger.exception("Writing to Redis failed when attempting to write links extracted") + break def frontier_start(self): pass From 1584f6107ebe6fae28f1ed1d0ff75e57b5c846d9 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Mon, 29 May 2017 10:46:04 +0200 Subject: [PATCH 041/273] Document update Add information about Redis backend to the frontier backends doc. --- docs/source/topics/frontier-backends.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/source/topics/frontier-backends.rst b/docs/source/topics/frontier-backends.rst index 706da8377..92dc8608e 100644 --- a/docs/source/topics/frontier-backends.rst +++ b/docs/source/topics/frontier-backends.rst @@ -289,6 +289,19 @@ setting. Queue exploration shuffling with MR jobs +Redis backend +^^^^^^^^^^^^^ + +.. autoclass:: frontera.contrib.backends.redis_backend.RedisBackend + +This is similar to the HBase backend. It is suitable for large scale crawlers that still has a limited scope. It is +recommended to ensure Redis is allowed to use enough memory to store all metadata the crawler needs. In case of Redis +running out of memory, the crawler will log this and continue. When the crawler is unable to write metadata to the +database; that metadata is lost. + +In case of connection errors; the crawler will attempt to reconnect three times. If the third attempt at connecting +to Redis fails, the worker will skip that Redis operation and continue operating. + .. _FIFO: http://en.wikipedia.org/wiki/FIFO .. 
_LIFO: http://en.wikipedia.org/wiki/LIFO_(computing) .. _DFS: http://en.wikipedia.org/wiki/Depth-first_search @@ -298,3 +311,4 @@ setting. .. _SQLAlchemy: http://www.sqlalchemy.org/ .. _any databases supported by SQLAlchemy: http://docs.sqlalchemy.org/en/latest/dialects/index.html .. _declarative sqlalchemy models: http://docs.sqlalchemy.org/en/latest/orm/extensions/declarative/index.html + From 509771636b72fc1e6af690125db7f5bcbf4820fc Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 29 May 2017 19:07:59 +0200 Subject: [PATCH 042/273] marking test_queue_with_delay as xfail --- tests/contrib/backends/hbase/test_hbase.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/contrib/backends/hbase/test_hbase.py b/tests/contrib/backends/hbase/test_hbase.py index e0a039fd1..a6cc44b10 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -8,6 +8,7 @@ from time import time from w3lib.util import to_native_str from tests import mock +import pytest r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) @@ -51,6 +52,7 @@ def test_queue(self): assert set([r.url for r in queue.get_next_requests(10, 1, min_requests=3, min_hosts=1, max_requests_per_host=10)]) == set([r1.url, r2.url]) + @pytest.mark.xfail def test_queue_with_delay(self): connection = Connection(host='hbase-docker', port=9090) queue = HBaseQueue(connection, 1, b'queue', True) From 984b99744300a7edd67d12c9305437690300c299 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Tue, 30 May 2017 08:43:09 +0200 Subject: [PATCH 043/273] Document update Add information that queue items might be lost if there are errors when attempting to write to Redis. --- docs/source/topics/frontier-backends.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/topics/frontier-backends.rst b/docs/source/topics/frontier-backends.rst index 92dc8608e..2d4bc64cb 100644 --- a/docs/source/topics/frontier-backends.rst +++ b/docs/source/topics/frontier-backends.rst @@ -295,9 +295,9 @@ Redis backend .. autoclass:: frontera.contrib.backends.redis_backend.RedisBackend This is similar to the HBase backend. It is suitable for large scale crawlers that still has a limited scope. It is -recommended to ensure Redis is allowed to use enough memory to store all metadata the crawler needs. In case of Redis -running out of memory, the crawler will log this and continue. When the crawler is unable to write metadata to the -database; that metadata is lost. +recommended to ensure Redis is allowed to use enough memory to store all data the crawler needs. In case of Redis +running out of memory, the crawler will log this and continue. When the crawler is unable to write metadata or queue +items to the database; that metadata or queue items are lost. In case of connection errors; the crawler will attempt to reconnect three times. If the third attempt at connecting to Redis fails, the worker will skip that Redis operation and continue operating. 
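The retry policy that the two documentation patches above describe is small enough to sketch in full. The following is an illustrative, self-contained example of that policy, not code from this series: the function name count_queue and the localhost connection details are assumptions, while the timeout values and the skip-on-failure behaviour mirror what the backend documents.

    from time import sleep
    from redis import ConnectionPool, StrictRedis
    from redis.exceptions import ConnectionError, ResponseError

    def _retry_timeouts():
        # Pauses between connection retries; None means "give up and skip".
        for timeout in [0, 10, 30]:
            yield timeout
        yield None

    def count_queue(pool, partition_id):
        # Hypothetical helper: retry ConnectionError three times with
        # increasing pauses, then skip the operation; skip ResponseError
        # (typically Redis out of memory) immediately.
        timeouts = _retry_timeouts()
        while True:
            try:
                return StrictRedis(connection_pool=pool).zcard(partition_id)
            except ConnectionError:
                pause = next(timeouts)
                if pause is None:
                    return None  # skip the operation, keep the crawler running
                sleep(pause)
            except ResponseError:
                return None  # report and continue, as the docs describe

Yielding None from the generator keeps the give-up decision in one place instead of counting attempts at every call site.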
From 93abdcae98adff272155e864e78ea5e6eb72e34b Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 30 May 2017 10:07:52 +0200 Subject: [PATCH 044/273] fixing prefix string type --- frontera/contrib/backends/hbase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 1b7e29c91..3766386b0 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -208,7 +208,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): count = 0 # filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) # TODO: figure out how to use filter here, Thrift filter above causes full scan - for rk, data in table.scan(limit=int(limit), batch_size=256, row_prefix=prefix): + for rk, data in table.scan(limit=int(limit), batch_size=256, row_prefix=to_bytes(prefix)): for cq, buf in six.iteritems(data): if cq == b'f:t': continue From b30acbbb301de8b4db88e84f7e5ae63dbef2ab22 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 30 May 2017 10:50:38 +0200 Subject: [PATCH 045/273] style fix --- frontera/contrib/backends/hbase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 3766386b0..fbbefb80d 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -196,7 +196,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): tries = 0 count = 0 prefix = '%d_' % partition_id - now_ts = int(time()) + #now_ts = int(time()) while tries < self.GET_RETRIES: tries += 1 From 4d6b492a4a68848c37f6b63e8f57678cc457d0de Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Wed, 31 May 2017 15:40:01 +0200 Subject: [PATCH 046/273] Refactor use Redis wrapper Add RedisOperations mixin for unified error and retry handling. Use RedisOperations in RedisQueue, RedisMetadata and RedisState. --- .../backends/redis_backend/__init__.py | 400 +++++++----------- 1 file changed, 152 insertions(+), 248 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 06f024337..79c29eed7 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -25,7 +25,6 @@ FIELD_STATUS_CODE = b'status_code' FIELD_URL = b'url' - """ Error handling: * On Connection error: @@ -38,13 +37,50 @@ Redis after a while. 
""" -# Timeout generator with backoff until 60 seconds -def _get_retry_timeouts(): - for timeout in [0, 10, 30]: yield timeout - yield None +class RedisOperations: + @classmethod + def _get_retry_timeouts(cls): + # Timeout generator with backoff until 60 seconds + for timeout in [0, 10, 30]: yield timeout + yield None + + def _redis_operation(self, message, operation, *args): + timeout = self._get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + return operation(connection, *args) + except ConnectionError as e: + self._logger.exception("Connection to Redis failed when attempting to {0}".format(message)) + pause = timeout.next() + if pause == None: + break + sleep(pause) + except ResponseError as e: + self._logger.exception("Writing to Redis failed when attempting to {0}".format(message)) + break -class RedisQueue(Queue): + def _redis_pipeline(self, message, operation, *args): + timeout = self._get_retry_timeouts() + while True: + try: + connection = StrictRedis(connection_pool=self._pool) + pipe = connection.pipeline() + operation(pipe, *args) + return pipe.execute() + except ConnectionError as e: + self._exception("Connection to Redis failed when attempting to {0}".format(message)) + pause = timeout.next() + if pause == None: + break + sleep(pause) + except ResponseError as e: + self._logger.exception("Writing to Redis failed when attempting to {0}".format(message)) + break + + +class RedisQueue(Queue, RedisOperations): MAX_SCORE = 1.0 MIN_SCORE = 0.0 SCORE_STEP = 0.01 @@ -62,8 +98,27 @@ def __init__(self, manager, pool, partitions, delete_all_keys=False): self._logger = logging.getLogger("redis_backend.queue") if delete_all_keys: - connection = StrictRedis(connection_pool=self._pool) - connection.flushdb() + self._redis_operation( "flushing db", lambda connection: connection.flushdb()) + + def _get_items(self, connection, partition_id, start, now_ts, queue, max_requests_per_host, max_host_items, count, max_n_requests, to_remove): + for data in connection.zrevrange(partition_id, start=start, end=max_n_requests + start): + start += 1 + item = unpackb(data, use_list=False) + timestamp, fprint, host_crc32, _, score = item + if timestamp > now_ts: + continue + if host_crc32 not in queue: + queue[host_crc32] = [] + if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host: + continue + queue[host_crc32].append(item) + if len(queue[host_crc32]) > max_host_items: + max_host_items = len(queue[host_crc32]) + count += 1 + to_remove.append(data) + if count >= max_n_requests: + break + return start, count, max_host_items def get_next_requests(self, max_n_requests, partition_id, **kwargs): """ @@ -82,39 +137,12 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): last_start = -1 while count < max_n_requests and last_start < start: last_start = start - timeout = _get_retry_timeouts() - while True: - try: - connection = StrictRedis(connection_pool=self._pool) - for data in connection.zrevrange(partition_id, start=start, end=max_n_requests + start): - start += 1 - item = unpackb(data, use_list=False) - timestamp, fprint, host_crc32, _, score = item - if timestamp > now_ts: - continue - if host_crc32 not in queue: - queue[host_crc32] = [] - if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host: - continue - queue[host_crc32].append(item) - if len(queue[host_crc32]) > max_host_items: - max_host_items = len(queue[host_crc32]) - count += 1 - to_remove.append(data) - - if count >= 
max_n_requests: - break - break - except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to get more requests") - pause = timeout.next() - if pause == None: - break - sleep(pause) - except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to get more requests") - break - + start, count, max_host_items = self._redis_operation( + "get more requests", + lambda connection: self._get_items( + connection, partition_id, start, now_ts, queue, max_requests_per_host, max_host_items, count, + max_n_requests, to_remove) + ) self._logger.debug("Finished: hosts {}, requests {}".format(len(queue.keys()), count)) results = [] @@ -129,21 +157,9 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): request.meta[FIELD_SCORE] = score results.append(request) if len(to_remove) > 0: - timeout = _get_retry_timeouts() - while True: - try: - connection = StrictRedis(connection_pool=self._pool) - connection.zrem(partition_id, *to_remove) - break - except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to remove scheduled items") - pause = timeout.next() - if pause == None: - break - sleep(pause) - except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to remove scheduled items") - break + self._redis_operation( + "remove scheduled items", lambda connection: connection.zrem(partition_id, *to_remove) + ) return results def schedule(self, batch): @@ -187,44 +203,17 @@ def _schedule(self, batch, timestamp): item = (timestamp, fingerprint, host_crc32, self._encoder.encode_request(request), score) interval_start = self.get_interval_start(score) data.setdefault(partition_id, []).extend([int(interval_start * 100), packb(item)]) - timeout = _get_retry_timeouts() - while True: - try: - connection = StrictRedis(connection_pool=self._pool) - pipe = connection.pipeline() - for key, items in data.items(): - connection.zadd(key, *items) - pipe.execute() - break - except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to schedule items") - pause = timeout.next() - if pause == None: - break - sleep(pause) - except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to schedule items") - break + self._redis_pipeline( + "schedule items", + lambda pipe, data: map(lambda (key, items): pipe.zadd(key, *items), data.items()), data + ) def count(self): - timeout = _get_retry_timeouts() - while True: - try: - count = 0 - connection = StrictRedis(connection_pool=self._pool) - for partition_id in self._partitions: - count += connection.zcard(partition_id) - break - except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to count items") - pause = timeout.next() - if pause == None: - break - sleep(pause) - except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to count items") - break - return count + return self._redis_operation( + "count items", + lambda connection, partitions: sum(map(lambda partition_id: connection.zcard(partition_id), partitions)), + self._partitions + ) def frontier_start(self): pass @@ -233,7 +222,7 @@ def frontier_stop(self): pass -class RedisState(States): +class RedisState(States, RedisOperations): def __init__(self, pool, cache_size_limit): self._pool = pool self._cache = {} @@ -260,24 +249,11 @@ def get(obj): def flush(self, force_clear): if len(self._cache) > self._cache_size_limit: 
force_clear = True - timeout = _get_retry_timeouts() - while True: - try: - connection = StrictRedis(connection_pool=self._pool) - pipe = connection.pipeline() - for fprint, state in self._cache.items(): - pipe.hmset(fprint, {FIELD_STATE: state}) - pipe.execute() - break - except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to flush cache") - pause = timeout.next() - if pause == None: - break - sleep(pause) - except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to flush cache") - break + self._redis_pipeline( + "flush cache", + lambda pipe, cache: map(lambda (fprint, state): pipe.hmset(fprint, {FIELD_STATE: state}), cache.items()), + self._cache + ) if force_clear: self._logger.debug("Cache has %d requests, clearing" % len(self._cache)) self._cache.clear() @@ -286,30 +262,16 @@ def fetch(self, fingerprints): to_fetch = [f for f in fingerprints if f not in self._cache] self._logger.debug("cache size %s" % len(self._cache)) self._logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints))) - timeout = _get_retry_timeouts() - while True: - try: - connection = StrictRedis(connection_pool=self._pool) - pipe = connection.pipeline() - for key in to_fetch: - pipe.hgetall(key) - responses = pipe.execute() - for index, key in enumerate(to_fetch): - response = responses[index] - if len(response) > 0 and FIELD_STATE in response: - self._cache[key] = response[FIELD_STATE] - else: - self._cache[key] = self.NOT_CRAWLED - break - except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to fetch fingerprints") - pause = timeout.next() - if pause == None: - break - sleep(pause) - except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to fetch fingerprints") - break + responses = self._redis_pipeline( + "fetch fingerprints", + lambda pipe, to_fetch: map(lambda key: pipe.hgetall(key), to_fetch), to_fetch + ) + for index, key in enumerate(to_fetch): + response = responses[index] + if len(response) > 0 and FIELD_STATE in response: + self._cache[key] = response[FIELD_STATE] + else: + self._cache[key] = self.NOT_CRAWLED def frontier_start(self): pass @@ -318,135 +280,77 @@ def frontier_stop(self): self.flush(False) -class RedisMetadata(Metadata): +class RedisMetadata(Metadata, RedisOperations): def __init__(self, pool, delete_all_keys): self._pool = pool self._logger = logging.getLogger("redis_backend.metadata") if delete_all_keys: - timeout = _get_retry_timeouts() - while True: - try: - connection = StrictRedis(connection_pool=self._pool) - connection.flushdb() - break - except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to flush database") - pause = timeout.next() - if pause == None: - break - sleep(pause) - except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to flush database") - break + self._redis_operation("flush database", lambda connection: connection.flushdb()) @classmethod def timestamp(cls): return str(datetime.utcnow().replace(microsecond=0)) + def _create_seed(self, seed): + return { + FIELD_URL: seed.url, + FIELD_DEPTH: 0, + FIELD_CREATED_AT: self.timestamp(), + FIELD_DOMAIN_FINGERPRINT: seed.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] + } + def add_seeds(self, seeds): - timeout = _get_retry_timeouts() - while True: - try: - connection = StrictRedis(connection_pool=self._pool) - pipe = connection.pipeline() - for seed in seeds: - pipe.hmset( - 
seed.meta[FIELD_FINGERPRINT], - { - FIELD_URL: seed.url, - FIELD_DEPTH: 0, - FIELD_CREATED_AT: self.timestamp(), - FIELD_DOMAIN_FINGERPRINT: seed.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] - } - ) - pipe.execute() - break - except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to add seeds") - pause = timeout.next() - if pause == None: - break - sleep(pause) - except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to add seeds") - break + self._redis_pipeline( + "add seeds", + lambda pipe, seeds: map( + lambda seed: pipe.hmset(seed.meta[FIELD_FINGERPRINT], self._create_seed(seed)), seeds), seeds + ) + + def _create_request_error(self, page, error): + return { + FIELD_URL: page.url, + FIELD_CREATED_AT: self.timestamp(), + FIELD_ERROR: error, + FIELD_DOMAIN_FINGERPRINT: page.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] + } def request_error(self, page, error): - timeout = _get_retry_timeouts() - while True: - try: - connection = StrictRedis(connection_pool=self._pool) - connection.hmset( - page.meta[FIELD_FINGERPRINT], - { - FIELD_URL: page.url, - FIELD_CREATED_AT: self.timestamp(), - FIELD_ERROR: error, - FIELD_DOMAIN_FINGERPRINT: page.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] - } - ) - break - except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to write request error") - pause = timeout.next() - if pause == None: - break - sleep(pause) - except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to write request error") - break + self._redis_operation( + "write requests error", + lambda connection, page, error: connection.hmset(page.meta[FIELD_FINGERPRINT], self._create_request_error(page, error)), page, error + ) + + def _create_crawl_info(self, response): + return { + FIELD_STATUS_CODE: response.status_code + } def page_crawled(self, response): - timeout = _get_retry_timeouts() - while True: - try: - connection = StrictRedis(connection_pool=self._pool) - connection.hmset( - response.meta[FIELD_FINGERPRINT], - { - FIELD_STATUS_CODE: response.status_code - } - ) - break - except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to write page crawled status") - pause = timeout.next() - if pause == None: - break - sleep(pause) - except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to write page crawled status") - break + self._redis_operation( + "write page crawled status", + lambda connection, response: connection.hmset(response.meta[FIELD_FINGERPRINT], self._create_crawl_info(response)), response + ) + + def _create_link_extracted(self, link): + return { + FIELD_URL: link.url, + FIELD_CREATED_AT: self.timestamp(), + FIELD_DOMAIN_FINGERPRINT: link.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] + } def links_extracted(self, _, links): - timeout = _get_retry_timeouts() - while True: - try: - links_processed = set() - connection = StrictRedis(connection_pool=self._pool) - for link in links: - link_fingerprint = link.meta[FIELD_FINGERPRINT] - if link_fingerprint in links_processed: - continue - connection.hmset( - link_fingerprint, - { - FIELD_URL: link.url, - FIELD_CREATED_AT: self.timestamp(), - FIELD_DOMAIN_FINGERPRINT: link.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] - } - ) - links_processed.add(link_fingerprint) - break - except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to write links extracted") - pause = timeout.next() - if pause 
== None: - break - sleep(pause) - except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to write links extracted") - break + links_deduped = {} + for link in links: + link_fingerprint = link.meta[FIELD_FINGERPRINT] + if link_fingerprint in links_deduped: + continue + links_deduped[link_fingerprint] = link + self._redis_pipeline( + "write links extracted", + lambda pipe, links: map( + lambda (fingerprint, link): pipe.hmset(fingerprint, self._create_link_extracted(link)), links.items()), + links_deduped + ) def frontier_start(self): pass From 6dbd0d1e767956b14c5e7b56346ec816c420cbde Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Thu, 8 Jun 2017 10:29:16 +0200 Subject: [PATCH 047/273] Refactor Redis operations Add RedisOperation encapsulating error handling while allowing method calls similar to direct StrictRedis method calls. Add RedisPiepline encapsulating error handling while allowing method calls similar to direct StrictRedis method pipelining. This approach was suggested by Alex. --- .../backends/redis_backend/__init__.py | 137 +++++++++--------- .../backends/redis_backend/test_redis.py | 6 + 2 files changed, 74 insertions(+), 69 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 79c29eed7..b36b337b8 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -6,6 +6,7 @@ from frontera.core.components import Metadata, Queue, States from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.utils.misc import get_crc32, load_object +import functools import logging from msgpack import packb, unpackb from redis import ConnectionPool, StrictRedis @@ -38,49 +39,68 @@ """ -class RedisOperations: - @classmethod - def _get_retry_timeouts(cls): - # Timeout generator with backoff until 60 seconds - for timeout in [0, 10, 30]: yield timeout - yield None +def _get_retry_timeouts(): + # Timeout generator with backoff until 60 seconds + for timeout in [0, 10, 30]: yield timeout + yield None + + +class RedisOperation(object): + def __init__(self, pool): + self._connection = StrictRedis(connection_pool=pool) + self._logger = logging.getLogger("redis_backend.RedisOperation") + + def __getattr__(self, _api): + return functools.partial(self._redis_operation, _api) - def _redis_operation(self, message, operation, *args): - timeout = self._get_retry_timeouts() + def _redis_operation(self, _api, *args, **kwargs): + timeout = _get_retry_timeouts() while True: try: - connection = StrictRedis(connection_pool=self._pool) - return operation(connection, *args) + return getattr(self._connection, _api)(*args, **kwargs) except ConnectionError as e: - self._logger.exception("Connection to Redis failed when attempting to {0}".format(message)) + print('conn err') + self._logger.exception("Connection to Redis failed operation") pause = timeout.next() if pause == None: break sleep(pause) except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to {0}".format(message)) + self._logger.exception("Redis operation failed") break - def _redis_pipeline(self, message, operation, *args): - timeout = self._get_retry_timeouts() + +class RedisPipeline(object): + def __init__(self, pool): + self._connection = StrictRedis(connection_pool=pool) + self._pipeline = None + self._logger = logging.getLogger("redis_backend.RedisPipeline") + + def __getattr__(self, _api): + if not 
self._pipeline: + self._pipeline = self._connection.pipeline() + return getattr(self._pipeline, _api) + + def execute(self): + timeout = _get_retry_timeouts() + stack = self._pipeline.command_stack while True: try: - connection = StrictRedis(connection_pool=self._pool) - pipe = connection.pipeline() - operation(pipe, *args) - return pipe.execute() + return self._pipeline.execute() except ConnectionError as e: - self._exception("Connection to Redis failed when attempting to {0}".format(message)) + self._logger.exception("Connection to Redis failed when executing pipeline") pause = timeout.next() if pause == None: break sleep(pause) + self._pipeline.command_stack = stack except ResponseError as e: - self._logger.exception("Writing to Redis failed when attempting to {0}".format(message)) + self._logger.exception("Redis operation failed when executing pipeline") break -class RedisQueue(Queue, RedisOperations): + +class RedisQueue(Queue): MAX_SCORE = 1.0 MIN_SCORE = 0.0 SCORE_STEP = 0.01 @@ -98,7 +118,7 @@ def __init__(self, manager, pool, partitions, delete_all_keys=False): self._logger = logging.getLogger("redis_backend.queue") if delete_all_keys: - self._redis_operation( "flushing db", lambda connection: connection.flushdb()) + RedisOperation(self._pool).flushdb() def _get_items(self, connection, partition_id, start, now_ts, queue, max_requests_per_host, max_host_items, count, max_n_requests, to_remove): for data in connection.zrevrange(partition_id, start=start, end=max_n_requests + start): @@ -135,14 +155,12 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): to_remove = [] start = 0 last_start = -1 + connection = RedisOperation(self._pool) while count < max_n_requests and last_start < start: last_start = start - start, count, max_host_items = self._redis_operation( - "get more requests", - lambda connection: self._get_items( + start, count, max_host_items = self._get_items( connection, partition_id, start, now_ts, queue, max_requests_per_host, max_host_items, count, max_n_requests, to_remove) - ) self._logger.debug("Finished: hosts {}, requests {}".format(len(queue.keys()), count)) results = [] @@ -157,9 +175,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): request.meta[FIELD_SCORE] = score results.append(request) if len(to_remove) > 0: - self._redis_operation( - "remove scheduled items", lambda connection: connection.zrem(partition_id, *to_remove) - ) + connection.zrem(partition_id, *to_remove) return results def schedule(self, batch): @@ -203,17 +219,14 @@ def _schedule(self, batch, timestamp): item = (timestamp, fingerprint, host_crc32, self._encoder.encode_request(request), score) interval_start = self.get_interval_start(score) data.setdefault(partition_id, []).extend([int(interval_start * 100), packb(item)]) - self._redis_pipeline( - "schedule items", - lambda pipe, data: map(lambda (key, items): pipe.zadd(key, *items), data.items()), data - ) + pipeline = RedisPipeline(self._pool) + for (key, items) in data.items(): + pipeline.zadd(key, *items), data.items() + pipeline.execute() def count(self): - return self._redis_operation( - "count items", - lambda connection, partitions: sum(map(lambda partition_id: connection.zcard(partition_id), partitions)), - self._partitions - ) + connection = RedisOperation(self._pool) + return sum([connection.zcard(partition_id) for partition_id in self._partitions]) def frontier_start(self): pass @@ -222,7 +235,7 @@ def frontier_stop(self): pass -class RedisState(States, RedisOperations): +class 
RedisState(States): def __init__(self, pool, cache_size_limit): self._pool = pool self._cache = {} @@ -249,11 +262,9 @@ def get(obj): def flush(self, force_clear): if len(self._cache) > self._cache_size_limit: force_clear = True - self._redis_pipeline( - "flush cache", - lambda pipe, cache: map(lambda (fprint, state): pipe.hmset(fprint, {FIELD_STATE: state}), cache.items()), - self._cache - ) + pipeline = RedisPipeline(self._pool) + [pipeline.hmset(fprint, {FIELD_STATE: state}) for (fprint, state) in self._cache.items()] + pipeline.execute() if force_clear: self._logger.debug("Cache has %d requests, clearing" % len(self._cache)) self._cache.clear() @@ -262,10 +273,9 @@ def fetch(self, fingerprints): to_fetch = [f for f in fingerprints if f not in self._cache] self._logger.debug("cache size %s" % len(self._cache)) self._logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints))) - responses = self._redis_pipeline( - "fetch fingerprints", - lambda pipe, to_fetch: map(lambda key: pipe.hgetall(key), to_fetch), to_fetch - ) + pipeline = RedisPipeline(self._pool) + [pipeline.hgetall(key) for key in to_fetch] + responses = pipeline.execute() for index, key in enumerate(to_fetch): response = responses[index] if len(response) > 0 and FIELD_STATE in response: @@ -280,12 +290,12 @@ def frontier_stop(self): self.flush(False) -class RedisMetadata(Metadata, RedisOperations): +class RedisMetadata(Metadata): def __init__(self, pool, delete_all_keys): self._pool = pool self._logger = logging.getLogger("redis_backend.metadata") if delete_all_keys: - self._redis_operation("flush database", lambda connection: connection.flushdb()) + RedisOperation(self._pool).flushdb() @classmethod def timestamp(cls): @@ -300,11 +310,9 @@ def _create_seed(self, seed): } def add_seeds(self, seeds): - self._redis_pipeline( - "add seeds", - lambda pipe, seeds: map( - lambda seed: pipe.hmset(seed.meta[FIELD_FINGERPRINT], self._create_seed(seed)), seeds), seeds - ) + pipeline = RedisPipeline(self._pool) + [pipeline.hmset(seed.meta[FIELD_FINGERPRINT], self._create_seed(seed)) for seed in seeds] + pipeline.execute() def _create_request_error(self, page, error): return { @@ -315,10 +323,7 @@ def _create_request_error(self, page, error): } def request_error(self, page, error): - self._redis_operation( - "write requests error", - lambda connection, page, error: connection.hmset(page.meta[FIELD_FINGERPRINT], self._create_request_error(page, error)), page, error - ) + RedisOperation(self._pool).hmset(page.meta[FIELD_FINGERPRINT], self._create_request_error(page, error)) def _create_crawl_info(self, response): return { @@ -326,10 +331,7 @@ def _create_crawl_info(self, response): } def page_crawled(self, response): - self._redis_operation( - "write page crawled status", - lambda connection, response: connection.hmset(response.meta[FIELD_FINGERPRINT], self._create_crawl_info(response)), response - ) + RedisOperation(self._pool).hmset(response.meta[FIELD_FINGERPRINT], self._create_crawl_info(response)) def _create_link_extracted(self, link): return { @@ -345,12 +347,9 @@ def links_extracted(self, _, links): if link_fingerprint in links_deduped: continue links_deduped[link_fingerprint] = link - self._redis_pipeline( - "write links extracted", - lambda pipe, links: map( - lambda (fingerprint, link): pipe.hmset(fingerprint, self._create_link_extracted(link)), links.items()), - links_deduped - ) + pipeline = RedisPipeline(self._pool) + [pipeline.hmset(fingerprint, self._create_link_extracted(link)) for (fingerprint, link) in 
links_deduped.items()] + pipeline.execute() def frontier_start(self): pass diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py index 34fee6421..daa3c4335 100644 --- a/tests/contrib/backends/redis_backend/test_redis.py +++ b/tests/contrib/backends/redis_backend/test_redis.py @@ -9,6 +9,10 @@ from time import time from unittest import main, TestCase +from logging import basicConfig, INFO + +basicConfig(level=INFO) + class Request: def __init__(self, fingerprint, crawl_at, url, domain=None): @@ -393,6 +397,7 @@ def test_links_extracted(self): self.assertEqual(b'https://www.hellan.me/', connection.hmget("l3", FIELD_URL)[0]) self.assertEqual(b'd_l3', connection.hmget('l3', FIELD_DOMAIN_FINGERPRINT)[0]) + class RedisBackendTest(TestCase): @staticmethod def setup_subject(partitions): @@ -417,5 +422,6 @@ def test_get_next_request_has_requests(self): requests = subject.get_next_requests(max_next_requests=10, partitions=['0', '1']) self.assertEqual(3, len(requests)) + if __name__ == '__main__': main() From c3d8ad9b3ea2977cdf6d547a286647887df6f62f Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Wed, 14 Jun 2017 12:50:25 +0200 Subject: [PATCH 048/273] Refactor reuse objects Reuse RedisOperation and RedisPipeline in the main classes. Reuse pipeline in the RedisPipeline class. This reduces overhead and memory consumption. --- .../backends/redis_backend/__init__.py | 64 +++++++++---------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index b36b337b8..e930b38a9 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -72,13 +72,10 @@ def _redis_operation(self, _api, *args, **kwargs): class RedisPipeline(object): def __init__(self, pool): - self._connection = StrictRedis(connection_pool=pool) - self._pipeline = None + self._pipeline = self._connection.pipeline() self._logger = logging.getLogger("redis_backend.RedisPipeline") def __getattr__(self, _api): - if not self._pipeline: - self._pipeline = self._connection.pipeline() return getattr(self._pipeline, _api) def execute(self): @@ -99,7 +96,6 @@ def execute(self): break - class RedisQueue(Queue): MAX_SCORE = 1.0 MIN_SCORE = 0.0 @@ -112,16 +108,18 @@ def __init__(self, manager, pool, partitions, delete_all_keys=False): decoder_cls = load_object(codec_path + ".Decoder") self._encoder = encoder_cls(manager.request_model) self._decoder = decoder_cls(manager.request_model, manager.response_model) - self._pool = pool + self._redis = RedisOperation(pool) + self._redis_pipeline = RedisPipeline(pool) self._partitions = [i for i in range(0, partitions)] self._partitioner = Crc32NamePartitioner(self._partitions) self._logger = logging.getLogger("redis_backend.queue") if delete_all_keys: - RedisOperation(self._pool).flushdb() + self._redis.flushdb() - def _get_items(self, connection, partition_id, start, now_ts, queue, max_requests_per_host, max_host_items, count, max_n_requests, to_remove): - for data in connection.zrevrange(partition_id, start=start, end=max_n_requests + start): + def _get_items(self, partition_id, start, now_ts, queue, max_requests_per_host, max_host_items, count, + max_n_requests, to_remove): + for data in self._redis.zrevrange(partition_id, start=start, end=max_n_requests + start): start += 1 item = unpackb(data, use_list=False) timestamp, fprint, host_crc32, _, score = item @@ -155,12 
+153,11 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): to_remove = [] start = 0 last_start = -1 - connection = RedisOperation(self._pool) while count < max_n_requests and last_start < start: last_start = start start, count, max_host_items = self._get_items( - connection, partition_id, start, now_ts, queue, max_requests_per_host, max_host_items, count, - max_n_requests, to_remove) + partition_id, start, now_ts, queue, max_requests_per_host, max_host_items, count, + max_n_requests, to_remove) self._logger.debug("Finished: hosts {}, requests {}".format(len(queue.keys()), count)) results = [] @@ -175,7 +172,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): request.meta[FIELD_SCORE] = score results.append(request) if len(to_remove) > 0: - connection.zrem(partition_id, *to_remove) + self._redis.zrem(partition_id, *to_remove) return results def schedule(self, batch): @@ -219,14 +216,12 @@ def _schedule(self, batch, timestamp): item = (timestamp, fingerprint, host_crc32, self._encoder.encode_request(request), score) interval_start = self.get_interval_start(score) data.setdefault(partition_id, []).extend([int(interval_start * 100), packb(item)]) - pipeline = RedisPipeline(self._pool) for (key, items) in data.items(): - pipeline.zadd(key, *items), data.items() - pipeline.execute() + self._redis_pipeline.zadd(key, *items), data.items() + self._redis_pipeline.execute() def count(self): - connection = RedisOperation(self._pool) - return sum([connection.zcard(partition_id) for partition_id in self._partitions]) + return sum([self._redis.zcard(partition_id) for partition_id in self._partitions]) def frontier_start(self): pass @@ -237,7 +232,8 @@ def frontier_stop(self): class RedisState(States): def __init__(self, pool, cache_size_limit): - self._pool = pool + self._redis = RedisOperation(pool) + self._redis_pipeline = RedisPipeline(pool) self._cache = {} self._cache_size_limit = cache_size_limit self._logger = logging.getLogger("redis_backend.states") @@ -262,9 +258,8 @@ def get(obj): def flush(self, force_clear): if len(self._cache) > self._cache_size_limit: force_clear = True - pipeline = RedisPipeline(self._pool) - [pipeline.hmset(fprint, {FIELD_STATE: state}) for (fprint, state) in self._cache.items()] - pipeline.execute() + [self._redis_pipeline.hmset(fprint, {FIELD_STATE: state}) for (fprint, state) in self._cache.items()] + self._redis_pipeline.execute() if force_clear: self._logger.debug("Cache has %d requests, clearing" % len(self._cache)) self._cache.clear() @@ -273,9 +268,8 @@ def fetch(self, fingerprints): to_fetch = [f for f in fingerprints if f not in self._cache] self._logger.debug("cache size %s" % len(self._cache)) self._logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints))) - pipeline = RedisPipeline(self._pool) - [pipeline.hgetall(key) for key in to_fetch] - responses = pipeline.execute() + [self._redis_pipeline.hgetall(key) for key in to_fetch] + responses = self._redis_pipeline.execute() for index, key in enumerate(to_fetch): response = responses[index] if len(response) > 0 and FIELD_STATE in response: @@ -292,10 +286,11 @@ def frontier_stop(self): class RedisMetadata(Metadata): def __init__(self, pool, delete_all_keys): - self._pool = pool + self._redis = RedisOperation(pool) + self._redis_pipeline = RedisPipeline(pool) self._logger = logging.getLogger("redis_backend.metadata") if delete_all_keys: - RedisOperation(self._pool).flushdb() + self._redis.flushdb() @classmethod def timestamp(cls): @@ -310,9 +305,8 @@ def 
_create_seed(self, seed): } def add_seeds(self, seeds): - pipeline = RedisPipeline(self._pool) - [pipeline.hmset(seed.meta[FIELD_FINGERPRINT], self._create_seed(seed)) for seed in seeds] - pipeline.execute() + [self._redis_pipeline.hmset(seed.meta[FIELD_FINGERPRINT], self._create_seed(seed)) for seed in seeds] + self._redis_pipeline.execute() def _create_request_error(self, page, error): return { @@ -323,7 +317,7 @@ def _create_request_error(self, page, error): } def request_error(self, page, error): - RedisOperation(self._pool).hmset(page.meta[FIELD_FINGERPRINT], self._create_request_error(page, error)) + self._redis.hmset(page.meta[FIELD_FINGERPRINT], self._create_request_error(page, error)) def _create_crawl_info(self, response): return { @@ -331,7 +325,7 @@ def _create_crawl_info(self, response): } def page_crawled(self, response): - RedisOperation(self._pool).hmset(response.meta[FIELD_FINGERPRINT], self._create_crawl_info(response)) + self._redis.hmset(response.meta[FIELD_FINGERPRINT], self._create_crawl_info(response)) def _create_link_extracted(self, link): return { @@ -347,9 +341,9 @@ def links_extracted(self, _, links): if link_fingerprint in links_deduped: continue links_deduped[link_fingerprint] = link - pipeline = RedisPipeline(self._pool) - [pipeline.hmset(fingerprint, self._create_link_extracted(link)) for (fingerprint, link) in links_deduped.items()] - pipeline.execute() + [self._redis_pipeline.hmset(fingerprint, self._create_link_extracted(link)) for (fingerprint, link) in + links_deduped.items()] + self._redis_pipeline.execute() def frontier_start(self): pass From dbbff1f061fd120175b66b9a35d926b7b5b9889f Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Fri, 23 Jun 2017 12:34:46 +0200 Subject: [PATCH 049/273] Bugfix RedisPipeline Add creation of redis operation before attempting to create pipeline. This prevents the recurring error caused by trying to access an uninstantiated member variable. --- frontera/contrib/backends/redis_backend/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index e930b38a9..d8a8de8e8 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -72,7 +72,8 @@ def _redis_operation(self, _api, *args, **kwargs): class RedisPipeline(object): def __init__(self, pool): - self._pipeline = self._connection.pipeline() + connection = StrictRedis(connection_pool=pool) + self._pipeline = connection.pipeline() self._logger = logging.getLogger("redis_backend.RedisPipeline") def __getattr__(self, _api): From f7cda7638ae96264f12089fc0a02955c0fc2f833 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Fri, 23 Jun 2017 13:04:46 +0200 Subject: [PATCH 050/273] Style corrections Remove unused variables. Fix typos. Change == None to is None. --- .../backends/redis_backend/__init__.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index d8a8de8e8..b7cf339c5 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -29,19 +29,20 @@ """ Error handling: * On Connection error: -** Retry three times with increasing timout. +** Retry three times with increasing timeout. ** Skip the operation if the third retry fails. * On Response error: ** Report and continue. 
-** Reponse error is usually caused by Redis using all available memory. Ideally, Redis should have enough memory +** Response error is usually caused by Redis using all available memory. Ideally, Redis should have enough memory for this not to happen. Still, if Redis is full, the rest of the crawler may continue and free up some space in Redis after a while. """ def _get_retry_timeouts(): - # Timeout generator with backoff until 60 seconds - for timeout in [0, 10, 30]: yield timeout + # Timeout generator with back off until 60 seconds + for timeout in [0, 10, 30]: + yield timeout yield None @@ -58,14 +59,14 @@ def _redis_operation(self, _api, *args, **kwargs): while True: try: return getattr(self._connection, _api)(*args, **kwargs) - except ConnectionError as e: + except ConnectionError: print('conn err') self._logger.exception("Connection to Redis failed operation") pause = timeout.next() - if pause == None: + if pause is None: break sleep(pause) - except ResponseError as e: + except ResponseError: self._logger.exception("Redis operation failed") break @@ -85,14 +86,14 @@ def execute(self): while True: try: return self._pipeline.execute() - except ConnectionError as e: + except ConnectionError: self._logger.exception("Connection to Redis failed when executing pipeline") pause = timeout.next() - if pause == None: + if pause is None: break sleep(pause) self._pipeline.command_stack = stack - except ResponseError as e: + except ResponseError: self._logger.exception("Redis operation failed when executing pipeline") break @@ -320,7 +321,8 @@ def _create_request_error(self, page, error): def request_error(self, page, error): self._redis.hmset(page.meta[FIELD_FINGERPRINT], self._create_request_error(page, error)) - def _create_crawl_info(self, response): + @staticmethod + def _create_crawl_info(response): return { FIELD_STATUS_CODE: response.status_code } From d3ce641b6c2650e43826d426eaa632d3f6c94237 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Fri, 23 Jun 2017 13:22:09 +0200 Subject: [PATCH 051/273] Code cleanup Remove trailing whitespace in comments. --- frontera/contrib/backends/redis_backend/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index b7cf339c5..95880627c 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -29,12 +29,12 @@ """ Error handling: * On Connection error: -** Retry three times with increasing timeout. +** Retry three times with increasing timeout. ** Skip the operation if the third retry fails. * On Response error: ** Report and continue. ** Response error is usually caused by Redis using all available memory. Ideally, Redis should have enough memory - for this not to happen. Still, if Redis is full, the rest of the crawler may continue and free up some space in + for this not to happen. Still, if Redis is full, the rest of the crawler may continue and free up some space in Redis after a while. """ From b918da6bef68ce199ddb24d0ae50c13d08280837 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Fri, 7 Jul 2017 09:23:46 +0200 Subject: [PATCH 052/273] Code Cleanup Remove print statement that was added for test purposes. 
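With the debug print gone, the wrappers act as thin proxies: any StrictRedis method name resolves through __getattr__ and runs under the retry handling. A minimal usage sketch, illustrative rather than part of this patch, assuming redis-py and a Redis instance on localhost:

    from redis import ConnectionPool
    from frontera.contrib.backends.redis_backend import RedisOperation, RedisPipeline

    pool = ConnectionPool(host='localhost', port=6379)  # assumed local Redis

    op = RedisOperation(pool)
    op.hmset(b'some_fingerprint', {b'state': b'1'})  # proxied; retried on ConnectionError

    pipe = RedisPipeline(pool)
    pipe.hgetall(b'some_fingerprint')  # queued on the wrapped pipeline
    print(pipe.execute())              # execute() restores the command stack on retry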
--- frontera/contrib/backends/redis_backend/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py
index 95880627c..9fd78ca09 100644
--- a/frontera/contrib/backends/redis_backend/__init__.py
+++ b/frontera/contrib/backends/redis_backend/__init__.py
@@ -60,7 +60,6 @@ def _redis_operation(self, _api, *args, **kwargs):
             try:
                 return getattr(self._connection, _api)(*args, **kwargs)
             except ConnectionError:
-                print('conn err')
                 self._logger.exception("Connection to Redis failed operation")
                 pause = timeout.next()
                 if pause is None:

From 3088729a51d2bb304dd3f61c2f25643d5b5a2795 Mon Sep 17 00:00:00 2001
From: "Knut O. Hellan"
Date: Fri, 7 Jul 2017 12:28:41 +0200
Subject: [PATCH 053/273] Refactor and support min_hosts

Added support for min_hosts in RedisQueue get_next_requests.
Added testing of min_hosts in RedisQueue get_next_requests.
Added testing of max_requests_per_host in RedisQueue get_next_requests.
Added testing of max_requests in RedisQueue get_next_requests.
Refactored loops in RedisQueue get_next_requests for a slight
performance (speed) improvement.
---
 .../backends/redis_backend/__init__.py       | 18 ++---
 .../backends/redis_backend/test_redis.py     | 65 +++++++++++++++++++
 2 files changed, 75 insertions(+), 8 deletions(-)

diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py
index 9fd78ca09..cd3035770 100644
--- a/frontera/contrib/backends/redis_backend/__init__.py
+++ b/frontera/contrib/backends/redis_backend/__init__.py
@@ -128,7 +128,7 @@ def _get_items(self, partition_id, start, now_ts, queue, max_requests_per_host,
                 continue
             if host_crc32 not in queue:
                 queue[host_crc32] = []
-            if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host:
+            if max_requests_per_host is not None and len(queue[host_crc32]) >= max_requests_per_host:
                 continue
             queue[host_crc32].append(item)
             if len(queue[host_crc32]) > max_host_items:
@@ -144,9 +144,12 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs):
         Fetch new batch from priority queue.
         :param max_n_requests: maximum number of requests
         :param partition_id: partition id to get batch from
+        :param min_hosts: minimum number of hosts
+        :param max_requests_per_host: maximum number of requests per host
         :return: list of :class:`Request ` objects.
""" max_requests_per_host = kwargs.pop('max_requests_per_host') + min_hosts = kwargs.pop('min_hosts') queue = {} count = 0 now_ts = int(time()) @@ -154,19 +157,18 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): to_remove = [] start = 0 last_start = -1 - while count < max_n_requests and last_start < start: + while (count < max_n_requests or len(queue) < min_hosts) and last_start < start : last_start = start - start, count, max_host_items = self._get_items( + start, subset_count, max_host_items = self._get_items( partition_id, start, now_ts, queue, max_requests_per_host, max_host_items, count, max_n_requests, to_remove) + count += subset_count + self._logger.debug("Finished: hosts {}, requests {}".format(len(queue.keys()), count)) results = [] - for i in range(max_host_items): - for host_crc32, items in queue.items(): - if len(items) <= i: - continue - item = items[i] + for host_crc32, items in queue.items(): + for item in items: (_, _, _, encoded, score) = item to_remove.append(packb(item)) request = self._decoder.decode_request(encoded) diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py index daa3c4335..90f0d99e9 100644 --- a/tests/contrib/backends/redis_backend/test_redis.py +++ b/tests/contrib/backends/redis_backend/test_redis.py @@ -236,6 +236,71 @@ def test_scheduling_conflict_high_score_high_timestamp(self): self.assertTrue('https://www.khellan.com/' in urls) self.assertEqual(5, subject.count()) + def test_get_next_requests_max_requests(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + requests = subject.get_next_requests(1, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertEqual(2, subject.count()) + + def test_get_next_requests_min_hosts(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + requests = subject.get_next_requests(1, partition_id=0, min_hosts=2, min_requests=1, max_requests_per_host=5) + self.assertEqual(2, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertTrue('https://www.khellan.com/' in urls) + self.assertEqual(1, subject.count()) + + def test_get_next_requests_min_hosts_high_number(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, 
subject.count())
+        requests = subject.get_next_requests(1, partition_id=0, min_hosts=5, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(2, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.knuthellan.com/' in urls)
+        self.assertTrue('https://www.khellan.com/' in urls)
+        self.assertEqual(1, subject.count())
+
+    def test_get_next_requests_max_requests_per_host(self):
+        subject = self.setup_subject(2)
+        batch = [
+            ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True),
+            ("1", 0.99, Request("1", int(time()) - 10, 'https://www.knuthellan.com/a', domain='knuthellan.com'), True),
+            ("1", 0.98, Request("1", int(time()) - 10, 'https://www.knuthellan.com/c', domain='knuthellan.com'), True),
+            ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True),
+            ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True),
+        ]
+        subject.schedule(batch)
+        self.assertEqual(5, subject.count())
+        requests = subject.get_next_requests(5, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=2)
+        self.assertGreaterEqual(len(requests), 2)
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.knuthellan.com/' in urls)
+        self.assertTrue('https://www.knuthellan.com/a' in urls)
+        self.assertFalse('https://www.knuthellan.com/c' in urls)
+

 class RedisStateTest(TestCase):
     def test_update_cache(self):

From 4562344db17a8d98d6e5eb41b498966a70b3d240 Mon Sep 17 00:00:00 2001
From: "Knut O. Hellan"
Date: Fri, 7 Jul 2017 12:48:32 +0200
Subject: [PATCH 054/273] Code cleanup

Removed unneeded whitespace.
---
 frontera/contrib/backends/redis_backend/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py
index cd3035770..9fadca6ad 100644
--- a/frontera/contrib/backends/redis_backend/__init__.py
+++ b/frontera/contrib/backends/redis_backend/__init__.py
@@ -157,7 +157,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs):
         to_remove = []
         start = 0
         last_start = -1
-        while (count < max_n_requests or len(queue) < min_hosts) and last_start < start :
+        while (count < max_n_requests or len(queue) < min_hosts) and last_start < start:
             last_start = start
             start, subset_count, max_host_items = self._get_items(
                 partition_id, start, now_ts, queue, max_requests_per_host, max_host_items, count,

From dd4d155f55326a0595ea611c9e0d375af10962c9 Mon Sep 17 00:00:00 2001
From: "Knut O. Hellan"
Date: Mon, 28 Aug 2017 09:47:20 +0200
Subject: [PATCH 055/273] Added test

Test correct handling when there are fewer items in the queue than
max_n_requests and fewer hosts than min_hosts.
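The case covered here is driven by the exit condition of the fetch loop
introduced in PATCH 053. A simplified model of that loop is sketched below;
`scan_once` is a hypothetical stand-in for the real `_get_items` call, which
returns the new scan position and the number of requests found.

    def fetch(scan_once, max_n_requests, min_hosts):
        # Keep scanning while more requests or more hosts are wanted, but
        # stop as soon as a pass makes no progress (last_start == start).
        queue = {}  # host -> list of buffered requests
        count = 0
        start, last_start = 0, -1
        while (count < max_n_requests or len(queue) < min_hosts) and last_start < start:
            last_start = start
            start, found = scan_once(start, queue)
            count += found
        return [request for requests in queue.values() for request in requests]

With one host and one item in the queue, the second pass advances nothing, so
the loop terminates and the lone request is still returned; that is exactly
what the new test asserts.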
--- tests/contrib/backends/redis_backend/test_redis.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py
index 90f0d99e9..adbc6ceb4 100644
--- a/tests/contrib/backends/redis_backend/test_redis.py
+++ b/tests/contrib/backends/redis_backend/test_redis.py
@@ -301,6 +301,20 @@ def test_get_next_requests_max_requests_per_host(self):
         self.assertTrue('https://www.knuthellan.com/a' in urls)
         self.assertFalse('https://www.knuthellan.com/c' in urls)
 
+    def test_get_next_requests_few_items_few_hosts(self):
+        subject = self.setup_subject(2)
+        batch = [
+            ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True)
+        ]
+        subject.schedule(batch)
+        self.assertEqual(1, subject.count())
+        requests = subject.get_next_requests(1, partition_id=0, min_hosts=2, min_requests=1, max_requests_per_host=5)
+        self.assertEqual(1, len(requests))
+        urls = [request.url for request in requests]
+        self.assertTrue('https://www.knuthellan.com/' in urls)
+        self.assertEqual(0, subject.count())
+
+
 class RedisStateTest(TestCase):
     def test_update_cache(self):

From 7001d905319ad7ffff26cd143eda1ee8469c2458 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Thu, 9 Mar 2017 12:02:35 +0100
Subject: [PATCH 056/273] large internal patch: headers and redirects
 persistence, cityhash64 partitioning, str types in msgpack, async
 offsetsfetcher for kafka, graceful shutdown fix in both workers, SSL
 connection support for kafka message bus
---
 docs/source/topics/frontera-settings.rst      | 31 ++++++-
 examples/cluster/bc/spiders/bc.py             |  1 +
 frontera/contrib/backends/hbase.py            | 48 ++++++----
 frontera/contrib/backends/partitioners.py     |  8 +-
 .../contrib/backends/remote/codecs/msgpack.py |  5 +-
 frontera/contrib/messagebus/kafka/__init__.py | 76 +++++++++++++++-
 frontera/contrib/messagebus/kafka/async.py    | 10 ++-
 frontera/contrib/messagebus/kafkabus.py       | 88 +++++++++++++++----
 frontera/core/__init__.py                     | 42 +++++++--
 frontera/settings/default_settings.py         |  5 +-
 frontera/utils/fingerprint.py                 | 13 +--
 frontera/utils/ossignal.py                    | 17 ++++
 frontera/worker/db.py                         | 44 +++++++++-
 frontera/worker/strategy.py                   | 55 +++++++++---
 requirements.txt                              |  1 +
 setup.py                                      |  3 +-
 16 files changed, 373 insertions(+), 74 deletions(-)
 create mode 100644 frontera/utils/ossignal.py

diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst
index 805fc52f7..559b29738 100644
--- a/docs/source/topics/frontera-settings.rst
+++ b/docs/source/topics/frontera-settings.rst
@@ -350,6 +350,16 @@ Default: ``False``
 Determines if content should be sent over the message bus and stored in the backend: a serious performance killer.
 
+.. setting:: SW_FLUSH_INTERVAL
+
+SW_FLUSH_INTERVAL
+-----------------
+
+Default: ``900``
+
+Mean interval between flushing of states in :term:`strategy worker`. Selected randomly using formula
+SW_FLUSH_INTERVAL + RANDINT(-SW_FLUSH_INTERVAL/2, SW_FLUSH_INTERVAL/2).
+
 .. setting:: TEST_MODE
 
 TEST_MODE
@@ -632,11 +642,28 @@ Hostname and port of kafka broker, separated with :. Can be a string with hostna
 KAFKA_CODEC
 -----------
 
-Default: ``None``
+Default: ``none``
 
-Kafka-python 1.0.x version compression codec to use, is a string or None and could be one of ``snappy``, ``gzip`` or
+Kafka-python 1.0.x version compression codec to use, is a string and could be one of ``none``, ``snappy``, ``gzip`` or
 ``lz4``.
 
+
+.. 
setting:: KAFKA_CERT_PATH + +KAFKA_CERT_PATH +--------------- + +OS path to the folder with three certificate files: ca-cert.pem, client-cert.pem, client-key.pem. + + +.. setting:: KAFKA_ENABLE_SSL + +KAFKA_ENABLE_SSL +---------------- + +Boolean. Set to True to enable SSL connection in Kafka client. + + .. setting:: SPIDER_LOG_DBW_GROUP SPIDER_LOG_DBW_GROUP diff --git a/examples/cluster/bc/spiders/bc.py b/examples/cluster/bc/spiders/bc.py index ac3ee3bf3..98914e372 100644 --- a/examples/cluster/bc/spiders/bc.py +++ b/examples/cluster/bc/spiders/bc.py @@ -4,6 +4,7 @@ from scrapy.http.response.html import HtmlResponse from scrapy.linkextractors import LinkExtractor from scrapy import signals +from scrapy.exceptions import DontCloseSpider class BCSpider(Spider): name = 'bc' diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index fbbefb80d..145591700 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -9,7 +9,7 @@ from frontera.contrib.backends.remote.codecs.msgpack import Decoder, Encoder from happybase import Connection -from msgpack import Unpacker, Packer +from msgpack import Unpacker, Packer, packb import six from six.moves import range from w3lib.util import to_bytes @@ -32,9 +32,11 @@ 'status_code': lambda x: pack('>H', x), 'state': lambda x: pack('>B', x), 'error': to_bytes, - 'domain_fingerprint': to_bytes, + 'domain_fprint': to_bytes, 'score': lambda x: pack('>f', x), - 'content': to_bytes + 'content': to_bytes, + 'headers': packb, + 'dest_fprint': to_bytes } @@ -79,7 +81,7 @@ def __init__(self, connection, partitions, table_name, drop=False): tables.remove(self.table_name) if self.table_name not in tables: - self.connection.create_table(self.table_name, {'f': {'max_versions': 1, 'block_cache_enabled': 1}}) + self.connection.create_table(self.table_name, {'f': {'max_versions': 1, 'compression': 'SNAPPY'}}) class DumbResponse: pass @@ -196,8 +198,9 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): tries = 0 count = 0 prefix = '%d_' % partition_id - #now_ts = int(time()) - + now_ts = int(time()) + # TODO: figure out how to use filter here, Thrift filter above causes full scan + filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) while tries < self.GET_RETRIES: tries += 1 limit *= 5.5 if tries > 1 else 1.0 @@ -206,9 +209,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): meta_map.clear() queue.clear() count = 0 - # filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) - # TODO: figure out how to use filter here, Thrift filter above causes full scan - for rk, data in table.scan(limit=int(limit), batch_size=256, row_prefix=to_bytes(prefix)): + for rk, data in table.scan(limit=int(limit), batch_size=256, row_prefix=prefix): # filter=filter for cq, buf in six.iteritems(data): if cq == b'f:t': continue @@ -357,12 +358,21 @@ def add_seeds(self, seeds): obj = prepare_hbase_object(url=seed.url, depth=0, created_at=utcnow_timestamp(), - domain_fingerprint=seed.meta[b'domain'][b'fingerprint']) + domain_fprint=seed.meta[b'domain'][b'fingerprint']) self.batch.put(unhexlify(seed.meta[b'fingerprint']), obj) def page_crawled(self, response): - obj = prepare_hbase_object(status_code=response.status_code, content=response.body) if self.store_content else \ - prepare_hbase_object(status_code=response.status_code) + headers = response.headers + redirect_urls = 
response.request.meta.get(b'redirect_urls') + redirect_fprints = response.request.meta.get(b'redirect_fingerprints') + if redirect_urls: + for url, fprint in zip(redirect_urls, redirect_fprints): + obj = prepare_hbase_object(url=url, + created_at=utcnow_timestamp(), + dest_fprint=redirect_fprints[-1]) + self.batch.put(fprint, obj) + obj = prepare_hbase_object(status_code=response.status_code, headers=headers, content=response.body) if self.store_content else \ + prepare_hbase_object(status_code=response.status_code, headers=headers) self.batch.put(unhexlify(response.meta[b'fingerprint']), obj) def links_extracted(self, request, links): @@ -372,16 +382,22 @@ def links_extracted(self, request, links): for link_fingerprint, (link, link_url, link_domain) in six.iteritems(links_dict): obj = prepare_hbase_object(url=link_url, created_at=utcnow_timestamp(), - domain_fingerprint=link_domain[b'fingerprint']) + domain_fprint=link_domain[b'fingerprint']) self.batch.put(link_fingerprint, obj) def request_error(self, request, error): obj = prepare_hbase_object(url=request.url, created_at=utcnow_timestamp(), error=error, - domain_fingerprint=request.meta[b'domain'][b'fingerprint']) + domain_fprint=request.meta[b'domain'][b'fingerprint']) rk = unhexlify(request.meta[b'fingerprint']) self.batch.put(rk, obj) + if b'redirect_urls' in request.meta: + for url, fprint in zip(request.meta[b'redirect_urls'], request.meta[b'redirect_fingerprints']): + obj = prepare_hbase_object(url=url, + created_at=utcnow_timestamp(), + dest_fprint=request.meta[b'redirect_fingerprints'][-1]) + self.batch.put(fprint, obj) def update_score(self, batch): if not isinstance(batch, dict): @@ -412,13 +428,15 @@ def __init__(self, manager): 'host': host, 'port': int(port), 'table_prefix': namespace, - 'table_prefix_separator': ':' + 'table_prefix_separator': ':', + 'timeout': 60000 } if settings.get('HBASE_USE_FRAMED_COMPACT'): kwargs.update({ 'protocol': 'compact', 'transport': 'framed' }) + self.logger.info("Connecting to %s:%d thrift server.", host, port) self.connection = Connection(**kwargs) self._metadata = None self._queue = None diff --git a/frontera/contrib/backends/partitioners.py b/frontera/contrib/backends/partitioners.py index 5b425c20e..b038000f6 100644 --- a/frontera/contrib/backends/partitioners.py +++ b/frontera/contrib/backends/partitioners.py @@ -1,9 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -from struct import unpack -from binascii import unhexlify from frontera.core.components import Partitioner +from cityhash import CityHash64 from frontera.utils.misc import get_crc32 @@ -27,9 +26,8 @@ class FingerprintPartitioner(Partitioner): def partition(self, key, partitions=None): if not partitions: partitions = self.partitions - digest = unhexlify(key[0:2] + key[5:7] + key[10:12] + key[15:17]) - value = unpack(" self._max_per_key: + self._log.warning("Purging of key %s, of size %d has started", key, + len(pending)) + purged = 0 + while len(pending) > self._keep_per_key: + pending.popleft() + purged += 1 + self._log.warning("%d requests purged", purged) + + def _check_and_purge_keys(self): + if len(self._pending) > self._max_keys: + self._log.warning("Purging the keys") + new_keys = set(sample(self._pending.keys(), self._keep_keys)) + keys = set(self._pending.keys()) + while keys: + key = keys.pop() + if key not in new_keys: + del self._pending[key] + self._log.warning("Finished purging of keys") + def get_next_requests(self, max_n_requests, **kwargs): + if self._log.isEnabledFor(DEBUG): + 
self._log.debug("Overused keys: %s", str(kwargs['overused_keys']))
+            self._log.debug("Pending: %i", (sum([len(pending) for pending in six.itervalues(self._pending)])))
+        self._check_and_purge_keys()
         overused_set = set(kwargs['overused_keys'])
         requests = list(self._get_pending(max_n_requests, overused_set))
@@ -71,6 +101,8 @@ def get_next_requests(self, max_n_requests, **kwargs):
             key = self._get_key(request, kwargs['key_type'])
             if key in overused_set:
                 self._pending[key].append(request)
+                # contacts-crawler strategy related hack
+                self._check_and_purge(key)
             else:
                 requests.append(request)
         return requests
diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py
index 3bf72a003..c0e8797ff 100644
--- a/frontera/settings/default_settings.py
+++ b/frontera/settings/default_settings.py
@@ -58,6 +58,7 @@
 STATE_CACHE_SIZE = 1000000
 STATE_CACHE_SIZE_LIMIT = 0
 STORE_CONTENT = False
+SW_FLUSH_INTERVAL = 900
 TEST_MODE = False
 TLDEXTRACT_DOMAIN_INFO = False
 URL_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1'
@@ -80,4 +81,6 @@
 SCORING_LOG_DBW_GROUP = "dbw-scoring-log"
 SPIDER_FEED_GROUP = "fetchers-spider-feed"
 
-KAFKA_CODEC = None
\ No newline at end of file
+KAFKA_CODEC = None
+KAFKA_CERT_PATH = '/mnt/mesos/sandbox'
+KAFKA_ENABLE_SSL = False
\ No newline at end of file
diff --git a/frontera/utils/fingerprint.py b/frontera/utils/fingerprint.py
index b491ebcb6..17e65f21f 100644
--- a/frontera/utils/fingerprint.py
+++ b/frontera/utils/fingerprint.py
@@ -26,11 +26,12 @@ def hostname_local_fingerprint(key):
     :return: str 20 bytes hex string
     """
     result = parse_url(key)
-    hostname = result.hostname if result.hostname else '-'
-    host_checksum = get_crc32(hostname)
-    combined = hostname+result.path+';'+result.params+result.query+result.fragment
+    if not result.hostname:
+        return sha1(key)
+    host_checksum = get_crc32(result.hostname)
+    doc_uri_combined = result.path+';'+result.params+result.query+result.fragment
 
-    combined = to_bytes(combined, 'utf8', 'ignore')
-    doc_fprint = hashlib.md5(combined).digest()
+    doc_uri_combined = to_bytes(doc_uri_combined, 'utf8', 'ignore')
+    doc_fprint = hashlib.md5(doc_uri_combined).digest()
     fprint = hexlify(pack(">i16s", host_checksum, doc_fprint))
-    return fprint
\ No newline at end of file
+    return fprint
diff --git a/frontera/utils/ossignal.py b/frontera/utils/ossignal.py
new file mode 100644
index 000000000..283a98a69
--- /dev/null
+++ b/frontera/utils/ossignal.py
@@ -0,0 +1,17 @@
+import signal
+from twisted.internet import reactor
+
+
+def install_shutdown_handlers(function, override_sigint=True):
+    """Install the given function as a signal handler for all common shutdown
+    signals (such as SIGINT, SIGTERM, etc). If override_sigint is ``False`` the
+    SIGINT handler won't be installed if there is already a handler in place
+    (e.g. 
Pdb) + """ + signal.signal(signal.SIGTERM, function) + if signal.getsignal(signal.SIGINT) == signal.default_int_handler or \ + override_sigint: + signal.signal(signal.SIGINT, function) + # Catch Ctrl-Break in windows + if hasattr(signal, "SIGBREAK"): + signal.signal(signal.SIGBREAK, function) \ No newline at end of file diff --git a/frontera/worker/db.py b/frontera/worker/db.py index 6f9abad85..45a0ecd62 100644 --- a/frontera/worker/db.py +++ b/frontera/worker/db.py @@ -9,6 +9,7 @@ from os.path import exists from twisted.internet import reactor, task +from twisted.internet.defer import Deferred from frontera.core.components import DistributedBackend from frontera.core.manager import FrontierManager from frontera.utils.url import parse_domain_from_url_fast @@ -17,6 +18,7 @@ from frontera.settings import Settings from frontera.utils.misc import load_object from frontera.utils.async import CallLaterOnce +from frontera.utils.ossignal import install_shutdown_handlers from .server import WorkerJsonRpcService import six from six.moves import map @@ -60,6 +62,12 @@ def schedule(self, on_start=False): self.scoring_consumption.schedule() self.scheduling.schedule(5.0) + def cancel(self): + self.scheduling.cancel() + self.scoring_consumption.cancel() + self.new_batch.cancel() + self.consumption.cancel() + class DBWorker(object): def __init__(self, settings, no_batches, no_incoming, no_scoring): @@ -110,13 +118,43 @@ def debug(sig, frame): self.slot.schedule(on_start=True) self._logging_task.start(30) + install_shutdown_handlers(self._handle_shutdown) signal(SIGUSR1, debug) - reactor.addSystemEventTrigger('before', 'shutdown', self.stop) - reactor.run() + reactor.run(installSignalHandlers=False) + + def _handle_shutdown(self, signum, _): + def call_shutdown(): + d = self.stop_tasks() + reactor.callLater(0, d.callback, None) + + logger.info("Received shutdown signal %d, shutting down gracefully.", signum) + reactor.callFromThread(call_shutdown) + + def stop_tasks(self): + logger.info("Stopping periodic tasks.") + self._logging_task.stop() + self.slot.cancel() + + d = Deferred() + d.addBoth(self._perform_shutdown) + d.addBoth(self._stop_reactor) + return d + + def _stop_reactor(self, _=None): + logger.info("Stopping reactor.") + try: + reactor.stop() + except RuntimeError: # raised if already stopped or in shutdown stage + pass - def stop(self): + def _perform_shutdown(self, _=None): logger.info("Stopping frontier manager.") self._manager.stop() + logger.info("Closing message bus.") + if not self.strategy_disabled: + self.scoring_log_consumer.close() + self.spider_feed_producer.close() + self.spider_log_consumer.close() def log_status(self): for k, v in six.iteritems(self.stats): diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index a33008ca3..14759cf3d 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -7,12 +7,15 @@ from logging.config import fileConfig from argparse import ArgumentParser from os.path import exists +from random import randint from frontera.utils.misc import load_object +from frontera.utils.ossignal import install_shutdown_handlers from frontera.core.manager import FrontierManager from frontera.logger.handlers import CONSOLE from twisted.internet.task import LoopingCall from twisted.internet import reactor +from twisted.internet.defer import Deferred from frontera.settings import Settings from collections import Iterable @@ -55,7 +58,7 @@ def __init__(self, states): def to_fetch(self, requests): if isinstance(requests, Iterable): - 
self._fingerprints.update(x.meta[b'fingerprint'] for x in requests) + self._fingerprints.update([x.meta[b'fingerprint'] for x in requests]) return self._fingerprints.add(requests.meta[b'fingerprint']) @@ -112,6 +115,8 @@ def __init__(self, settings, strategy_class): self.task = LoopingCall(self.work) self._logging_task = LoopingCall(self.log_status) self._flush_states_task = LoopingCall(self.flush_states) + flush_interval = settings.get("SW_FLUSH_INTERVAL") + self._flush_interval = flush_interval + randint(-flush_interval / 2, flush_interval / 2) logger.info("Strategy worker is initialized and consuming partition %d", partition_id) def collect_unknown_message(self, msg): @@ -205,10 +210,9 @@ def work(self): # Exiting, if crawl is finished if self.strategy.finished(): logger.info("Successfully reached the crawling goal.") - logger.info("Closing crawling strategy.") - self.strategy.close() logger.info("Finishing.") - reactor.callFromThread(reactor.stop) + d = self.stop_tasks() + reactor.callLater(0, d.callback, None) self.stats['last_consumed'] = consumed self.stats['last_consumption_run'] = asctime() @@ -219,11 +223,9 @@ def log_failure(failure): logger.exception(failure.value) if failure.frames: logger.critical(str("").join(format_tb(failure.getTracebackObject()))) - def errback_main(failure): log_failure(failure) self.task.start(interval=0).addErrback(errback_main) - def errback_flush_states(failure): log_failure(failure) self._flush_states_task.start(interval=300).addErrback(errback_flush_states) @@ -232,12 +234,12 @@ def debug(sig, frame): logger.critical("Signal received: printing stack trace") logger.critical(str("").join(format_stack(frame))) + install_shutdown_handlers(self._handle_shutdown) self.task.start(interval=0).addErrback(errback_main) self._logging_task.start(interval=30) - self._flush_states_task.start(interval=300).addErrback(errback_flush_states) + self._flush_states_task.start(interval=self._flush_interval).addErrback(errback_flush_states) signal(SIGUSR1, debug) - reactor.addSystemEventTrigger('before', 'shutdown', self.stop) - reactor.run() + reactor.run(installSignalHandlers=False) def log_status(self): for k, v in six.iteritems(self.stats): @@ -246,11 +248,44 @@ def log_status(self): def flush_states(self): self.states_context.flush() - def stop(self): + def _handle_shutdown(self, signum, _): + def call_shutdown(): + d = self.stop_tasks() + reactor.callLater(0, d.callback, None) + + logger.info("Received shutdown signal %d, shutting down gracefully.", signum) + reactor.callFromThread(call_shutdown) + + def stop_tasks(self): + logger.info("Stopping periodic tasks.") + if self.task.running: + self.task.stop() + if self._flush_states_task.running: + self._flush_states_task.stop() + if self._logging_task.running: + self._logging_task.stop() + + d = Deferred() + d.addBoth(self._perform_shutdown) + d.addBoth(self._stop_reactor) + return d + + def _stop_reactor(self, _=None): + logger.info("Stopping reactor.") + try: + reactor.stop() + except RuntimeError: # raised if already stopped or in shutdown stage + pass + + def _perform_shutdown(self, _=None): + self.flush_states() logger.info("Closing crawling strategy.") self.strategy.close() logger.info("Stopping frontier manager.") self._manager.stop() + logger.info("Closing message bus.") + self.scoring_log_producer.close() + self.consumer.close() def on_add_seeds(self, seeds): logger.debug('Adding %i seeds', len(seeds)) diff --git a/requirements.txt b/requirements.txt index 7c718e7af..ac8e2ab03 100644 --- a/requirements.txt 
+++ b/requirements.txt @@ -1,2 +1,3 @@ six>=1.8.0 w3lib>=1.15.0 +cityhash>=0.1.7 diff --git a/setup.py b/setup.py index 1885c74a7..518e28f23 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,8 @@ ], install_requires=[ 'six>=1.8.0', - 'w3lib>=1.15.0' + 'w3lib>=1.15.0', + 'cityhash>=0.1.7' ], extras_require={ 'sql': [ From 14383a94f577d6dcaa21158fd8117f859c43e155 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 10 Mar 2017 15:40:13 +0100 Subject: [PATCH 057/273] increasing heartbeat interval for all consumers --- frontera/contrib/messagebus/kafkabus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py index 7f9e121a4..bc3d9a0de 100644 --- a/frontera/contrib/messagebus/kafkabus.py +++ b/frontera/contrib/messagebus/kafkabus.py @@ -42,6 +42,7 @@ def __init__(self, location, enable_ssl, cert_path, topic, group, partition_id): consumer_timeout_ms=100, client_id="%s-%s" % (self._topic, str(partition_id) if partition_id is not None else "all"), request_timeout_ms=120 * 1000, + heartbeat_interval_ms=10000, **kwargs ) From cdfd34c286d398f109692153db2b89bb7e8488dc Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 26 Oct 2017 17:47:37 +0200 Subject: [PATCH 058/273] comment unused --- frontera/contrib/backends/hbase.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 145591700..92419fda4 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -198,9 +198,9 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): tries = 0 count = 0 prefix = '%d_' % partition_id - now_ts = int(time()) + # now_ts = int(time()) # TODO: figure out how to use filter here, Thrift filter above causes full scan - filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) + # filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) while tries < self.GET_RETRIES: tries += 1 limit *= 5.5 if tries > 1 else 1.0 From 2c813c72d965dafbcf6d5be8893d8fb0388299af Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 26 Oct 2017 17:50:12 +0200 Subject: [PATCH 059/273] removing resurrected code --- frontera/contrib/messagebus/kafka/__init__.py | 75 ------------------- 1 file changed, 75 deletions(-) diff --git a/frontera/contrib/messagebus/kafka/__init__.py b/frontera/contrib/messagebus/kafka/__init__.py index 5d7b58a00..e69de29bb 100644 --- a/frontera/contrib/messagebus/kafka/__init__.py +++ b/frontera/contrib/messagebus/kafka/__init__.py @@ -1,75 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - -from collections import namedtuple -from logging import getLogger - -from kafka import KafkaClient -from kafka.common import OffsetRequestPayload, check_error, OffsetFetchRequestPayload - -logger = getLogger("offset-fetcher") -OffsetsStruct = namedtuple("OffsetsStruct", ["commit", "produced"]) - - -class OffsetsFetcherSync(object): - def __init__(self, location, topic, group_id): - self._client = KafkaClient(location) - self._topic = topic - self._group_id = group_id - self._client.load_metadata_for_topics() - self._offsets = OffsetsStruct(commit=dict(), - produced=dict()) - self._update_group_offsets() - self._update_produced_offsets() - - def _update_produced_offsets(self): - """ - Arguments: - request_time_ms (int): Used to ask for all messages before a - 
certain time (ms). There are two special values. Specify -1 to receive the latest - offset (i.e. the offset of the next coming message) and -2 to receive the earliest - available offset. Note that because offsets are pulled in descending order, asking for - the earliest offset will always return you a single element. - """ - for partition in self._client.get_partition_ids_for_topic(self._topic): - reqs = [OffsetRequestPayload(self._topic, partition, -1, 1)] - - (resp,) = self._client.send_offset_request(reqs) - - check_error(resp) - assert resp.topic == self._topic - assert resp.partition == partition - self._offsets.produced[partition] = resp.offsets[0] - - def _update_group_offsets(self): - logger.info("Consumer fetching stored offsets") - for partition in self._client.get_partition_ids_for_topic(self._topic): - (resp,) = self._client.send_offset_fetch_request_kafka( - self._group_id, - [OffsetFetchRequestPayload(self._topic, partition)], - fail_on_error=False) - try: - check_error(resp) - except Exception as exc: - logger.error(exc) - pass - - if resp.offset == -1: - self._offsets.commit[partition] = None - else: - self._offsets.commit[partition] = resp.offset - - def get(self): - """ - :return: dict Lags per partition - """ - self._update_produced_offsets() - self._update_group_offsets() - - lags = {} - for partition in self._client.get_partition_ids_for_topic(self._topic): - produced = self._offsets.produced[partition] - lag = produced - self._offsets.commit[partition] if self._offsets.commit[partition] else 0.0 - logger.debug("%s (%s): %s, %s, %s", self._topic, partition, produced, self._offsets.commit[partition], lag) - lags[partition] = lag - return lags \ No newline at end of file From c797fbac74be790d4f1c1fd2008d5a499d3c535a Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 26 Oct 2017 18:07:49 +0200 Subject: [PATCH 060/273] reverting fingerprinting logic back --- frontera/utils/fingerprint.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/frontera/utils/fingerprint.py b/frontera/utils/fingerprint.py index 17e65f21f..b491ebcb6 100644 --- a/frontera/utils/fingerprint.py +++ b/frontera/utils/fingerprint.py @@ -26,12 +26,11 @@ def hostname_local_fingerprint(key): :return: str 20 bytes hex string """ result = parse_url(key) - if not result.hostname: - return sha1(key) - host_checksum = get_crc32(result.hostname) - doc_uri_combined = result.path+';'+result.params+result.query+result.fragment + hostname = result.hostname if result.hostname else '-' + host_checksum = get_crc32(hostname) + combined = hostname+result.path+';'+result.params+result.query+result.fragment - doc_uri_combined = to_bytes(doc_uri_combined, 'utf8', 'ignore') - doc_fprint = hashlib.md5(doc_uri_combined).digest() + combined = to_bytes(combined, 'utf8', 'ignore') + doc_fprint = hashlib.md5(combined).digest() fprint = hexlify(pack(">i16s", host_checksum, doc_fprint)) - return fprint + return fprint \ No newline at end of file From ed40fd2082006456256e58a037455724c6cb697e Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 26 Oct 2017 18:36:05 +0200 Subject: [PATCH 061/273] settings for OverusedBuffer --- docs/source/topics/frontera-settings.rst | 23 +++++++++++++++++++ docs/source/topics/loggers.rst | 1 + frontera/contrib/backends/memory/__init__.py | 5 +++- .../contrib/backends/remote/messagebus.py | 4 ++-- frontera/core/__init__.py | 19 ++++++++------- frontera/settings/default_settings.py | 2 ++ 6 files changed, 43 insertions(+), 11 deletions(-) diff --git 
a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst
index 559b29738..d88eafccf 100644
--- a/docs/source/topics/frontera-settings.rst
+++ b/docs/source/topics/frontera-settings.rst
@@ -262,6 +262,29 @@ Default: ``30.0``
 Used in DB worker, and it's a time interval between production of new batches for all partitions. If partition is busy,
 it will be skipped.
 
+
+.. setting:: OVERUSED_MAX_PER_KEY
+
+OVERUSED_MAX_PER_KEY
+--------------------
+
+Default: ``None``
+
+The maximum number of requests to buffer per key (slot) in OverusedBuffer. When this limit is reached, purging is
+performed, leaving 0.1 * the specified value of requests per key.
+
+
+.. setting:: OVERUSED_MAX_KEYS
+
+OVERUSED_MAX_KEYS
+-----------------
+
+Default: ``None``
+
+The maximum number of keys (slots) in OverusedBuffer. When this limit is reached, purging is performed,
+leaving 0.1 * the specified value of keys.
+
+
 .. setting:: OVERUSED_SLOT_FACTOR
 
 OVERUSED_SLOT_FACTOR
diff --git a/docs/source/topics/loggers.rst b/docs/source/topics/loggers.rst
index 1714ed778..aced204ab 100644
--- a/docs/source/topics/loggers.rst
+++ b/docs/source/topics/loggers.rst
@@ -20,6 +20,7 @@ Loggers used
 * sqlalchemy.states
 * sqlalchemy.queue
 * offset-fetcher
+* overusedbuffer
 * messagebus-backend
 * cf-server
 * db-worker
diff --git a/frontera/contrib/backends/memory/__init__.py b/frontera/contrib/backends/memory/__init__.py
index e96cd29a8..22727cf43 100644
--- a/frontera/contrib/backends/memory/__init__.py
+++ b/frontera/contrib/backends/memory/__init__.py
@@ -248,7 +248,10 @@ def _create_queue(self, settings):
 class MemoryDFSOverusedBackend(MemoryDFSBackend):
     def __init__(self, manager):
         super(MemoryDFSOverusedBackend, self).__init__(manager)
-        self.overused_buffer = OverusedBuffer(super(MemoryDFSOverusedBackend, self).get_next_requests)
+        settings = manager.settings
+        self.overused_buffer = OverusedBuffer(super(MemoryDFSOverusedBackend, self).get_next_requests,
+                                              settings.get("OVERUSED_MAX_PER_KEY"),
+                                              settings.get("OVERUSED_MAX_KEYS"))
 
     def get_next_requests(self, max_next_requests, **kwargs):
         return self.overused_buffer.get_next_requests(max_next_requests, **kwargs)
diff --git a/frontera/contrib/backends/remote/messagebus.py b/frontera/contrib/backends/remote/messagebus.py
index f3827c22a..ce3614a13 100644
--- a/frontera/contrib/backends/remote/messagebus.py
+++ b/frontera/contrib/backends/remote/messagebus.py
@@ -26,8 +26,8 @@ def __init__(self, manager):
         self.consumer = spider_feed.consumer(partition_id=self.partition_id)
         self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
         self._logger = logging.getLogger("messagebus-backend")
-        self._buffer = OverusedBuffer(self._get_next_requests,
-                                      self._logger.debug)
+        self._buffer = OverusedBuffer(self._get_next_requests, settings.get("OVERUSED_MAX_PER_KEY"),
+                                      settings.get("OVERUSED_MAX_KEYS"))
         self._logger.info("Consuming from partition id %d", self.partition_id)
 
     @classmethod
diff --git a/frontera/core/__init__.py b/frontera/core/__init__.py
index 8f6c6f012..f8a823c32 100644
--- a/frontera/core/__init__.py
+++ b/frontera/core/__init__.py
@@ -30,7 +30,7 @@ class OverusedBuffer(object):
     A buffering object for implementing the buffer of Frontera requests for overused
     domains/ips. It can be used when customizing backend to address efficient downloader pool usage. 
""" - def __init__(self, _get_func, log_func=None): + def __init__(self, _get_func, max_per_key, max_keys): """ :param _get_func: reference to get_next_requests() method of binded class :param log_func: optional logging function, for logging of internal state @@ -38,10 +38,10 @@ def __init__(self, _get_func, log_func=None): self._pending = defaultdict(deque) self._get = _get_func self._log = getLogger("overusedbuffer") - self._max_per_key = 100 - self._keep_per_key = 10 - self._max_keys = 1000 - self._keep_keys = 100 + self._max_per_key = max_per_key + self._keep_per_key = int(max_per_key * 0.1) if max_per_key else None + self._max_keys = max_keys + self._keep_keys = int(max_keys * 0.1) if max_keys else None def _get_key(self, request, type): return get_slot_key(request, type) @@ -58,7 +58,8 @@ def _get_pending(self, max_n_requests, overused_set): try: yield pending[key].popleft() # contacts-crawler strategy related hack - self._check_and_purge(key) + if self._max_per_key: + self._check_and_purge(key) i += 1 except IndexError: keys.discard(key) @@ -90,7 +91,8 @@ def get_next_requests(self, max_n_requests, **kwargs): if self._log.isEnabledFor(DEBUG): self._log.debug("Overused keys: %s", str(kwargs['overused_keys'])) self._log.debug("Pending: %i", (sum([len(pending) for pending in six.itervalues(self._pending)]))) - self._check_and_purge_keys() + if self._max_keys: + self._check_and_purge_keys() overused_set = set(kwargs['overused_keys']) requests = list(self._get_pending(max_n_requests, overused_set)) @@ -102,7 +104,8 @@ def get_next_requests(self, max_n_requests, **kwargs): if key in overused_set: self._pending[key].append(request) # contacts-crawler strategy related hack - self._check_and_purge(key) + if self._max_per_key: + self._check_and_purge(key) else: requests.append(request) return requests diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index c0e8797ff..b75db08e5 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -31,6 +31,8 @@ ] NEW_BATCH_DELAY = 30.0 OVERUSED_SLOT_FACTOR = 5.0 +OVERUSED_MAX_PER_KEY = None +OVERUSED_MAX_KEYS = None QUEUE_HOSTNAME_PARTITIONING = False REDIS_BACKEND_CODEC = 'frontera.contrib.backends.remote.codecs.msgpack' REDIS_HOST = 'localhost' From 1657a2dfd26a5f3b147d13578c2276fb357ae340 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 1 Nov 2017 17:31:48 +0100 Subject: [PATCH 062/273] fixing tests --- frontera/contrib/backends/hbase.py | 10 ++- frontera/core/__init__.py | 5 +- tests/contrib/backends/hbase/test_hbase.py | 2 +- tests/test_core_overused_buffer.py | 78 +++++++++++++++------- tests/test_partitioners.py | 4 +- 5 files changed, 68 insertions(+), 31 deletions(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 92419fda4..8a22f4e2e 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -68,7 +68,7 @@ class HBaseQueue(Queue): GET_RETRIES = 3 - def __init__(self, connection, partitions, table_name, drop=False): + def __init__(self, connection, partitions, table_name, drop=False, use_snappy=False): self.connection = connection self.partitions = [i for i in range(0, partitions)] self.partitioner = Crc32NamePartitioner(self.partitions) @@ -80,8 +80,11 @@ def __init__(self, connection, partitions, table_name, drop=False): self.connection.delete_table(self.table_name, disable=True) tables.remove(self.table_name) + schema = {'f': {'max_versions': 1}} + if use_snappy: + 
schema['f']['compression'] = 'SNAPPY' if self.table_name not in tables: - self.connection.create_table(self.table_name, {'f': {'max_versions': 1, 'compression': 'SNAPPY'}}) + self.connection.create_table(self.table_name, schema) class DumbResponse: pass @@ -456,7 +459,8 @@ def db_worker(cls, manager): settings = manager.settings drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES') o._queue = HBaseQueue(o.connection, o.queue_partitions, - settings.get('HBASE_QUEUE_TABLE'), drop=drop_all_tables) + settings.get('HBASE_QUEUE_TABLE'), drop=drop_all_tables, + use_snappy=settings.get('HBASE_USE_SNAPPY')) o._metadata = HBaseMetadata(o.connection, settings.get('HBASE_METADATA_TABLE'), drop_all_tables, settings.get('HBASE_USE_SNAPPY'), settings.get('HBASE_BATCH_SIZE'), settings.get('STORE_CONTENT')) diff --git a/frontera/core/__init__.py b/frontera/core/__init__.py index f8a823c32..0732eab6d 100644 --- a/frontera/core/__init__.py +++ b/frontera/core/__init__.py @@ -49,6 +49,9 @@ def _get_key(self, request, type): def _get_pending_count(self): return sum(six.moves.map(len, six.itervalues(self._pending))) + def _get_key_count(self): + return len(self._pending) + def _get_pending(self, max_n_requests, overused_set): pending = self._pending i, keys = 0, set(pending) - overused_set @@ -90,7 +93,7 @@ def _check_and_purge_keys(self): def get_next_requests(self, max_n_requests, **kwargs): if self._log.isEnabledFor(DEBUG): self._log.debug("Overused keys: %s", str(kwargs['overused_keys'])) - self._log.debug("Pending: %i", (sum([len(pending) for pending in six.itervalues(self._pending)]))) + self._log.debug("Pending: %i", self._get_pending_count()) if self._max_keys: self._check_and_purge_keys() overused_set = set(kwargs['overused_keys']) diff --git a/tests/contrib/backends/hbase/test_hbase.py b/tests/contrib/backends/hbase/test_hbase.py index a6cc44b10..260dbf6c1 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -43,7 +43,7 @@ def test_metadata(self): def test_queue(self): connection = Connection(host='hbase-docker', port=9090) - queue = HBaseQueue(connection, 2, b'queue', True) + queue = HBaseQueue(connection, 2, b'queue', drop=True, use_snappy=False) batch = [('10', 0.5, r1, True), ('11', 0.6, r2, True), ('12', 0.7, r3, True)] queue.schedule(batch) diff --git a/tests/test_core_overused_buffer.py b/tests/test_core_overused_buffer.py index f08e32933..7801f9c3a 100644 --- a/tests/test_core_overused_buffer.py +++ b/tests/test_core_overused_buffer.py @@ -2,6 +2,9 @@ from frontera.core import OverusedBuffer from frontera.core.models import Request from six.moves import range +from itertools import cycle +from random import choice, sample +from string import ascii_lowercase r1 = Request('http://www.example.com') @@ -14,46 +17,73 @@ class TestOverusedBuffer(object): - requests = [] - logs = [] + requests = [r1, r2, r3, r4, r5, r6] - def get_func(self, max_n_requests, **kwargs): + def get_once(self, max_n_requests, **kwargs): lst = [] for _ in range(max_n_requests): - if self.requests: - lst.append(self.requests.pop()) + try: + lst.append(next(self.req_it)) + except StopIteration: + break return lst - def log_func(self, msg): - self.logs.append(msg) + def test_base(self): + self.req_it = iter(self.requests) + ob = OverusedBuffer(self.get_once, 100, 10000) - def test(self): - ob = OverusedBuffer(self.get_func, self.log_func) - self.requests = [r1, r2, r3, r4, r5, r6] + assert ob._get_pending_count() == 0 assert set(ob.get_next_requests(10, 
overused_keys=['www.example.com', 'example1.com'], key_type='domain')) == set([r4, r5]) - assert set(self.logs) == set(["Overused keys: ['www.example.com', 'example1.com']", - "Pending: 0"]) - self.logs = [] - + assert ob._get_pending_count() == 4 assert ob.get_next_requests(10, overused_keys=['www.example.com'], key_type='domain') == [r6] - assert set(self.logs) == set(["Overused keys: ['www.example.com']", - "Pending: 4"]) - self.logs = [] + assert ob._get_pending_count() == 3 assert ob.get_next_requests(10, overused_keys=['www.example.com'], key_type='domain') == [] - assert set(self.logs) == set(["Overused keys: ['www.example.com']", - "Pending: 3"]) - self.logs = [] + assert ob._get_pending_count() == 3 #the max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case. assert set(ob.get_next_requests(3, overused_keys=['example.com'], key_type='domain')) == set([r1, r2, r3]) - assert set(self.logs) == set(["Overused keys: ['example.com']", - "Pending: 3"]) - self.logs = [] + assert ob._get_pending_count() == 0 assert ob.get_next_requests(10, overused_keys=[], key_type='domain') == [] - assert set(self.logs) == set(["Overused keys: []", "Pending: 0"]) + assert ob._get_pending_count() == 0 + + def test_purging_keys(self): + self.req_it = cycle(self.requests) + ob = OverusedBuffer(self.get_once, 10, 100) + ob.get_next_requests(10, overused_keys=["example.com", "www.example.com"], + key_type="domain") + assert ob._get_pending_count() == 9 + ob.get_next_requests(10, overused_keys=["example.com", "www.example.com"], + key_type="domain") # purging of www.example.com + assert ob._get_pending_count() == 7 + + def generate_requests(self): + def get_random_host(): + return str("").join([choice(ascii_lowercase) for i in range(5)]) + + self.hosts = set() + for _ in range(21): + self.hosts.add(get_random_host()) + self.requests = [] + for host in self.hosts: + self.requests.append(Request("http://%s/" % (host))) + + + def test_purging_keys_set(self): + self.generate_requests() + self.req_it = cycle(self.requests) + ob = OverusedBuffer(self.get_once, 1000, 10) + + ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain") + assert (ob._get_key_count()) == 10 + + ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain") + assert (ob._get_key_count()) == 20 + + ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain") # purging of keys set + assert (ob._get_key_count()) < 20 diff --git a/tests/test_partitioners.py b/tests/test_partitioners.py index 61f52ada8..4e530ddc9 100644 --- a/tests/test_partitioners.py +++ b/tests/test_partitioners.py @@ -9,10 +9,10 @@ def test_fingerprint_partitioner(): fp = FingerprintPartitioner(partitions) key = '1be68ff556fd0bbe5802d1a100850da29f7f15b1' partition = fp.partition(key, partitions) - assert partition == 4 + assert partition == 2 partition = fp.partition(key, None) - assert partition == 4 + assert partition == 2 def test_crc32name_partitioner(): From e57c763289b8f88a354dee5ab8ac76dd4fab8dea Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 1 Nov 2017 17:49:31 +0100 Subject: [PATCH 063/273] adding zmq broker output --- .travis.yml | 1 + tox.ini | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e56db0d0a..09632f0c9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -58,6 +58,7 @@ script: tox after_success: - codecov + - cat broker.log deploy: provider: pypi diff --git a/tox.ini b/tox.ini index 9e47f6eac..7a725999a 100644 --- a/tox.ini +++ b/tox.ini @@ 
-17,7 +17,7 @@ deps = -r{toxinidir}/requirements.txt -r{toxinidir}/requirements/tests.txt commands = - py.test --cov-report=term --cov=frontera -s -v {posargs:tests} + py.test --cov-report=term --cov=frontera -s -v {posargs:tests} -k test_zmq [testenv:flake8] changedir = {toxinidir} From baed69e92eeb665e5ededd39e97b417aa0e588ae Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 1 Nov 2017 17:54:18 +0100 Subject: [PATCH 064/273] more work --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 09632f0c9..5dfdb5587 100644 --- a/.travis.yml +++ b/.travis.yml @@ -58,6 +58,8 @@ script: tox after_success: - codecov + +after_script: - cat broker.log deploy: From 8c5f63857910ca1c98d834586b4aa67f8a52f888 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 1 Nov 2017 18:01:03 +0100 Subject: [PATCH 065/273] adding cityhash to test reqs --- requirements/tests.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/tests.txt b/requirements/tests.txt index d73107e03..82efdbce3 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -15,3 +15,4 @@ boto>=2.42.0 -r logging.txt redis>=2.10.5 hiredis>=0.2 +cityhash>=0.1.7 From b0500665936b2e246a48cc510e6aed910429cd23 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 1 Nov 2017 18:05:26 +0100 Subject: [PATCH 066/273] enabling all tests --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 7a725999a..9e47f6eac 100644 --- a/tox.ini +++ b/tox.ini @@ -17,7 +17,7 @@ deps = -r{toxinidir}/requirements.txt -r{toxinidir}/requirements/tests.txt commands = - py.test --cov-report=term --cov=frontera -s -v {posargs:tests} -k test_zmq + py.test --cov-report=term --cov=frontera -s -v {posargs:tests} [testenv:flake8] changedir = {toxinidir} From e8dcd4860d7c2fdd544b7b563233df4f700cf42a Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 2 Nov 2017 18:14:27 +0100 Subject: [PATCH 067/273] converting row prefix to bytes --- frontera/contrib/backends/hbase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 8a22f4e2e..fbd059551 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -200,7 +200,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): limit = min_requests tries = 0 count = 0 - prefix = '%d_' % partition_id + prefix = to_bytes('%d_' % partition_id) # now_ts = int(time()) # TODO: figure out how to use filter here, Thrift filter above causes full scan # filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) From 90b3530268e009aa84ef26a27502051993ef90a0 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 2 Nov 2017 18:18:29 +0100 Subject: [PATCH 068/273] style --- frontera/contrib/backends/hbase.py | 2 +- frontera/contrib/messagebus/kafka/async.py | 8 +++----- frontera/contrib/scrapy/schedulers/frontier.py | 2 +- frontera/contrib/scrapy/schedulers/recording.py | 2 +- frontera/worker/strategy.py | 2 ++ 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index fbd059551..ede89e3a1 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -212,7 +212,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): meta_map.clear() queue.clear() count = 0 - for rk, data in 
table.scan(limit=int(limit), batch_size=256, row_prefix=prefix): # filter=filter + for rk, data in table.scan(limit=int(limit), batch_size=256, row_prefix=prefix): # filter=filter for cq, buf in six.iteritems(data): if cq == b'f:t': continue diff --git a/frontera/contrib/messagebus/kafka/async.py b/frontera/contrib/messagebus/kafka/async.py index c491fa272..e8965c91a 100644 --- a/frontera/contrib/messagebus/kafka/async.py +++ b/frontera/contrib/messagebus/kafka/async.py @@ -249,7 +249,7 @@ def _handle_offset_response(self, partitions, future, response): log.debug("Fetched offset %s for partition %d", offsets[0], part) result.append((TopicPartition(topic, part), offsets[0])) elif error_type in (Errors.NotLeaderForPartitionError, - Errors.UnknownTopicOrPartitionError): + Errors.UnknownTopicOrPartitionError): log.debug("Attempt to fetch offsets for partition %s failed due" " to obsolete leadership information, retrying.", str(partitions)) @@ -283,7 +283,7 @@ def fetch_committed_offsets(self, partitions): return future.value if not future.retriable(): - raise future.exception # pylint: disable-msg=raising-bad-type + raise future.exception # pylint: disable-msg=raising-bad-type time.sleep(self.config['retry_backoff_ms'] / 1000.0) @@ -386,9 +386,7 @@ def get(self): log.info("No partitions available, performing metadata update.") self._client.poll(future=future) return {} - partitions = [TopicPartition(self.topic, partition_id) - for partition_id in topic_partitions] - + partitions = [TopicPartition(self.topic, partition_id) for partition_id in topic_partitions] offsets = self.offsets(partitions, -1) committed = self.fetch_committed_offsets(partitions) lags = {} diff --git a/frontera/contrib/scrapy/schedulers/frontier.py b/frontera/contrib/scrapy/schedulers/frontier.py index 782a64c23..75592c192 100644 --- a/frontera/contrib/scrapy/schedulers/frontier.py +++ b/frontera/contrib/scrapy/schedulers/frontier.py @@ -162,7 +162,7 @@ def _get_pending_request(self): def _get_exception_code(self, exception): try: return exception.__class__.__name__ - except: + except Exception: return '?' def _request_is_redirected(self, request): diff --git a/frontera/contrib/scrapy/schedulers/recording.py b/frontera/contrib/scrapy/schedulers/recording.py index bcaed25e3..29bd671e8 100644 --- a/frontera/contrib/scrapy/schedulers/recording.py +++ b/frontera/contrib/scrapy/schedulers/recording.py @@ -144,5 +144,5 @@ def process_exception(self, request, exception, spider): def _get_exception_code(self, exception): try: return exception.__class__.__name__ - except: + except Exception: return '?' 
diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 14759cf3d..50b239077 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -223,9 +223,11 @@ def log_failure(failure): logger.exception(failure.value) if failure.frames: logger.critical(str("").join(format_tb(failure.getTracebackObject()))) + def errback_main(failure): log_failure(failure) self.task.start(interval=0).addErrback(errback_main) + def errback_flush_states(failure): log_failure(failure) self._flush_states_task.start(interval=300).addErrback(errback_flush_states) From 679c8c07ad149abd4020836c52f9d30e33465c56 Mon Sep 17 00:00:00 2001 From: preetwinder Date: Wed, 15 Mar 2017 22:29:19 +0530 Subject: [PATCH 069/273] add message type logging and time for batch --- frontera/worker/db.py | 12 +++++++++++- frontera/worker/strategy.py | 10 +++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/frontera/worker/db.py b/frontera/worker/db.py index 45a0ecd62..153103771 100644 --- a/frontera/worker/db.py +++ b/frontera/worker/db.py @@ -104,7 +104,12 @@ def __init__(self, settings, no_batches, no_incoming, no_scoring): self.stats = { 'consumed_since_start': 0, 'consumed_scoring_since_start': 0, - 'pushed_since_start': 0 + 'pushed_since_start': 0, + 'consumed_add_seeds': 0, + 'consumed_page_crawled': 0, + 'consumed_links_extracted': 0, + 'consumed_request_error': 0, + 'consumed_offset': 0 } self._logging_task = task.LoopingCall(self.log_status) @@ -183,6 +188,7 @@ def consume_incoming(self, *args, **kwargs): for seed in seeds: logger.debug('URL: %s', seed.url) self._backend.add_seeds(seeds) + self.stats['consumed_add_seeds'] += 1 continue if type == 'page_crawled': _, response = msg @@ -190,6 +196,7 @@ def consume_incoming(self, *args, **kwargs): if b'jid' not in response.meta or response.meta[b'jid'] != self.job_id: continue self._backend.page_crawled(response) + self.stats['consumed_page_crawled'] += 1 continue if type == 'links_extracted': _, request, links = msg @@ -197,6 +204,7 @@ def consume_incoming(self, *args, **kwargs): if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: continue self._backend.links_extracted(request, links) + self.stats['consumed_links_extracted'] += 1 continue if type == 'request_error': _, request, error = msg @@ -204,6 +212,7 @@ def consume_incoming(self, *args, **kwargs): if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: continue self._backend.request_error(request, error) + self.stats['consumed_request_error'] += 1 continue if type == 'offset': _, partition_id, offset = msg @@ -219,6 +228,7 @@ def consume_incoming(self, *args, **kwargs): self.spider_feed.mark_ready(partition_id) else: self.spider_feed.mark_busy(partition_id) + self.stats['consumed_offset'] += 1 continue logger.debug('Unknown message type %s', type) except Exception as exc: diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 50b239077..81b0671c9 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -109,7 +109,11 @@ def __init__(self, settings, strategy_class): self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context) self.states = self._manager.backend.states self.stats = { - 'consumed_since_start': 0 + 'consumed_since_start': 0, + 'consumed_add_seeds': 0, + 'consumed_page_crawled': 0, + 'consumed_links_extracted': 0, + 'consumed_request_error': 0 } self.job_id = 0 self.task = LoopingCall(self.work) @@ -176,24 +180,28 @@ def process_batch(self, batch): 
for seed in seeds: seed.meta[b'jid'] = self.job_id self.on_add_seeds(seeds) + self.stats['consumed_add_seeds'] += 1 continue if type == 'page_crawled': _, response = msg if b'jid' not in response.meta or response.meta[b'jid'] != self.job_id: continue self.on_page_crawled(response) + self.stats['consumed_page_crawled'] += 1 continue if type == 'links_extracted': _, request, links = msg if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: continue self.on_links_extracted(request, links) + self.stats['consumed_links_extracted'] += 1 continue if type == 'request_error': _, request, error = msg if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: continue self.on_request_error(request, error) + self.stats['consumed_request_error'] += 1 continue self.on_unknown_message(msg) except Exception as exc: From 312dd58e7c4030393fbe837acda561177f947736 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 23 Mar 2017 13:10:48 +0100 Subject: [PATCH 070/273] smaller poll intervals --- frontera/contrib/messagebus/kafkabus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py index bc3d9a0de..c7b18f667 100644 --- a/frontera/contrib/messagebus/kafkabus.py +++ b/frontera/contrib/messagebus/kafkabus.py @@ -59,7 +59,7 @@ def __init__(self, location, enable_ssl, cert_path, topic, group, partition_id): self._consumer._update_fetch_positions(self._partition_ids) self._start_looping_call() - def _start_looping_call(self, interval=60): + def _start_looping_call(self, interval=10): def errback(failure): logger.exception(failure.value) if failure.frames: From da3d0c704373aa4276e0f7938174074bf3f02138 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 27 Mar 2017 14:14:29 +0200 Subject: [PATCH 071/273] more logging in offset-fetcher --- frontera/contrib/messagebus/kafka/async.py | 1 + 1 file changed, 1 insertion(+) diff --git a/frontera/contrib/messagebus/kafka/async.py b/frontera/contrib/messagebus/kafka/async.py index e8965c91a..70cbbb9ea 100644 --- a/frontera/contrib/messagebus/kafka/async.py +++ b/frontera/contrib/messagebus/kafka/async.py @@ -180,6 +180,7 @@ def offsets(self, partitions, timestamp): refresh_future = self._client.cluster.request_update() self._client.poll(future=refresh_future, sleep=True) ok = False + log.warning("Got exception %s and kept the loop.", future.exception) break if ok: return offsets From 87f9e53d8e03ba84c24aff307d71846818a62f84 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 27 Mar 2017 18:17:16 +0200 Subject: [PATCH 072/273] logging contents of e --- frontera/contrib/messagebus/kafka/async.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/frontera/contrib/messagebus/kafka/async.py b/frontera/contrib/messagebus/kafka/async.py index 70cbbb9ea..c345a822b 100644 --- a/frontera/contrib/messagebus/kafka/async.py +++ b/frontera/contrib/messagebus/kafka/async.py @@ -212,6 +212,7 @@ def _send_offset_request(self, partitions, timestamp): # Client returns a future that only fails on network issues # so create a separate future and attach a callback to update it # based on response error codes + futures = [] for node_id, partitions in six.iteritems(nodes_per_partitions): request = OffsetRequest[0]( @@ -220,7 +221,10 @@ def _send_offset_request(self, partitions, timestamp): future_request = Future() _f = self._client.send(node_id, request) _f.add_callback(self._handle_offset_response, partitions, 
future_request) - _f.add_errback(lambda e: future_request.failure(e)) + def errback(e): + log.info("the future is %s", e) + future_request.failure(e) + _f.add_errback(errback) futures.append(future_request) return futures From bec18770871357717e9d50637b110af96f4a137f Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 28 Mar 2017 13:53:41 +0200 Subject: [PATCH 073/273] change err message and raising kafka log level to INFO --- frontera/contrib/messagebus/kafka/async.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/contrib/messagebus/kafka/async.py b/frontera/contrib/messagebus/kafka/async.py index c345a822b..8a66f855a 100644 --- a/frontera/contrib/messagebus/kafka/async.py +++ b/frontera/contrib/messagebus/kafka/async.py @@ -222,7 +222,7 @@ def _send_offset_request(self, partitions, timestamp): _f = self._client.send(node_id, request) _f.add_callback(self._handle_offset_response, partitions, future_request) def errback(e): - log.info("the future is %s", e) + log.error("Offset request errback error %s", e) future_request.failure(e) _f.add_errback(errback) futures.append(future_request) From c2e57a62346e06be50938a2ba1a4f2b54d623477 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 30 Mar 2017 13:54:56 +0200 Subject: [PATCH 074/273] fix of 'Future' object is not iterable --- frontera/contrib/messagebus/kafka/async.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/contrib/messagebus/kafka/async.py b/frontera/contrib/messagebus/kafka/async.py index 8a66f855a..1a6a24243 100644 --- a/frontera/contrib/messagebus/kafka/async.py +++ b/frontera/contrib/messagebus/kafka/async.py @@ -202,11 +202,11 @@ def _send_offset_request(self, partitions, timestamp): if node_id is None: log.debug("Partition %s is unknown for fetching offset," " wait for metadata refresh", partition) - return Future().failure(Errors.StaleMetadata(partition)) + return [Future().failure(Errors.StaleMetadata(partition))] elif node_id == -1: log.debug("Leader for partition %s unavailable for fetching offset," " wait for metadata refresh", partition) - return Future().failure(Errors.LeaderNotAvailableError(partition)) + return [Future().failure(Errors.LeaderNotAvailableError(partition))] nodes_per_partitions.setdefault(node_id, []).append(partition) # Client returns a future that only fails on network issues From 1ccbd42217190e4616202bad01622d48ac7421fd Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 29 May 2017 14:51:54 +0200 Subject: [PATCH 075/273] simplifying logging configuration --- frontera/worker/db.py | 2 +- frontera/worker/strategy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/worker/db.py b/frontera/worker/db.py index 45a0ecd62..eaa1b1f76 100644 --- a/frontera/worker/db.py +++ b/frontera/worker/db.py @@ -336,7 +336,7 @@ def get_fingerprint(request): logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and exists(logging_config_path): - fileConfig(logging_config_path) + fileConfig(logging_config_path, disable_existing_loggers=False) else: logging.basicConfig(level=args.log_level) logger.setLevel(args.log_level) diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 50b239077..cccd24dc7 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -344,7 +344,7 @@ def setup_environment(): logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and exists(logging_config_path): - fileConfig(logging_config_path) + 
fileConfig(logging_config_path, disable_existing_loggers=False)
     else:
         logging.basicConfig(level=args.log_level)
         logger.setLevel(args.log_level)

From a27120602e372993756e710ee6f1fe36a13ea816 Mon Sep 17 00:00:00 2001
From: Sun Wei
Date: Wed, 27 Dec 2017 10:25:35 +0800
Subject: [PATCH 076/273] Update hbase.py

In the HBase queue's `get_next_requests` method, the params `min_hosts` and
`max_requests_per_host` are meant to be optional: the method already checks
whether they are None (e.g. lines 225 and 236), but `kwargs.pop` raises a
KeyError when the caller does not pass them. Give both a None default.
---
 frontera/contrib/backends/hbase.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py
index ede89e3a1..17942c8b7 100644
--- a/frontera/contrib/backends/hbase.py
+++ b/frontera/contrib/backends/hbase.py
@@ -190,8 +190,8 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs):
         :return: list of :class:`Request ` objects.
         """
         min_requests = kwargs.pop('min_requests')
-        min_hosts = kwargs.pop('min_hosts')
-        max_requests_per_host = kwargs.pop('max_requests_per_host')
+        min_hosts = kwargs.pop('min_hosts', None)
+        max_requests_per_host = kwargs.pop('max_requests_per_host', None)
         assert(max_n_requests > min_requests)
 
         table = self.connection.table(self.table_name)

From b69e3188c5c117677752061f3e655a2f48df9c73 Mon Sep 17 00:00:00 2001
From: Sun Wei
Date: Wed, 27 Dec 2017 13:08:02 +0800
Subject: [PATCH 077/273] add missing b prefix to the meta state key

---
 examples/cluster/bc/broadcrawl/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cluster/bc/broadcrawl/__init__.py b/examples/cluster/bc/broadcrawl/__init__.py
index 6c22dcbcd..8fb2ccf94 100644
--- a/examples/cluster/bc/broadcrawl/__init__.py
+++ b/examples/cluster/bc/broadcrawl/__init__.py
@@ -74,7 +74,7 @@ def links_extracted(self, request, links):
         self._schedule_and_count(links)
 
     def page_error(self, request, error):
-        request.meta['state'] = States.ERROR
+        request.meta[b'state'] = States.ERROR
         self.schedule(request, score=0.0, dont_queue=True)
 
     def _schedule_and_count(self, links):

From fbc6fc2f4f4e09f34dde232bc03c7d2973ad6b6e Mon Sep 17 00:00:00 2001
From: Sun Wei
Date: Wed, 27 Dec 2017 17:10:02 +0800
Subject: [PATCH 078/273] update BCPerHostLimit in cluster example

The `from_worker` method here is a duplicate: the base class
`BaseCrawlingStrategy` already defines an identical `from_worker`, so the
override can be dropped (see the sketch below).
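For reference, a minimal sketch of why the override is redundant (class bodies
trimmed; parameter names follow the example's __init__, the rest is
illustrative, not the actual Frontera sources):

    class BaseCrawlingStrategy(object):
        @classmethod
        def from_worker(cls, manager, mb_stream, states_context):
            # cls is bound to whichever class the classmethod is called on,
            # so this already instantiates the subclass, not the base class
            return cls(manager, mb_stream, states_context)

    class BCPerHostLimit(BaseCrawlingStrategy):
        pass  # no from_worker override needed

BCPerHostLimit.from_worker(...) resolves `cls` to BCPerHostLimit, so the
removed override behaved exactly like the inherited classmethod.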
--- examples/cluster/bc/broadcrawl/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/cluster/bc/broadcrawl/__init__.py b/examples/cluster/bc/broadcrawl/__init__.py index 6c22dcbcd..45db7c234 100644 --- a/examples/cluster/bc/broadcrawl/__init__.py +++ b/examples/cluster/bc/broadcrawl/__init__.py @@ -58,10 +58,6 @@ def __init__(self, manager, mb_stream, states_context): self.logger = logging.getLogger("bcperhostlimit-strategy") super(BCPerHostLimit, self).__init__(manager, mb_stream, states_context) - @classmethod - def from_worker(cls, manager, mb_scheduler, states_context): - return cls(manager, mb_scheduler, states_context) - def add_seeds(self, seeds): self._schedule_and_count(seeds) From cbeee882481caa61fcdbe482c37f14564dda1643 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 15 Mar 2017 12:23:44 +0100 Subject: [PATCH 079/273] hbase states table option --- docs/source/topics/frontera-settings.rst | 9 +++++++++ frontera/contrib/backends/hbase.py | 21 +++++++++++++++------ frontera/contrib/messagebus/kafka/async.py | 1 + frontera/settings/default_settings.py | 1 + tests/contrib/backends/hbase/test_hbase.py | 7 +++++-- 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index d88eafccf..ee5ce2c89 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -580,6 +580,15 @@ Default: ``3000000`` Number of items in the :term:`state cache` of :term:`strategy worker`, before it get's flushed to HBase and cleared. +.. setting:: HBASE_STATES_TABLE + +HBASE_STATES_TABLE +^^^^^^^^^^^^^^^^^^ + +Default: ``states`` + +Name of the table used by :term:`strategy worker` to store link states. + .. 
setting:: HBASE_THRIFT_HOST HBASE_THRIFT_HOST diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index ede89e3a1..029d8dbfb 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -276,13 +276,24 @@ def count(self): class HBaseState(States): - def __init__(self, connection, table_name, cache_size_limit): + def __init__(self, connection, table_name, cache_size_limit, drop_all_tables): self.connection = connection - self._table_name = table_name + self._table_name = to_bytes(table_name) self.logger = logging.getLogger("hbase.states") self._state_cache = {} self._cache_size_limit = cache_size_limit + tables = set(connection.tables()) + if drop_all_tables and self._table_name in tables: + connection.delete_table(self._table_name, disable=True) + tables.remove(self._table_name) + + if self._table_name not in tables: + schema = {'s': {'max_versions': 1, 'block_cache_enabled': 1, + 'bloom_filter_type': 'ROW', 'in_memory': True, } + } + connection.create_table(self._table_name, schema) + def update_cache(self, objs): objs = objs if isinstance(objs, Iterable) else [objs] @@ -335,8 +346,6 @@ def __init__(self, connection, table_name, drop_all_tables, use_snappy, batch_si if self._table_name not in tables: schema = {'m': {'max_versions': 1}, - 's': {'max_versions': 1, 'block_cache_enabled': 1, - 'bloom_filter_type': 'ROW', 'in_memory': True, }, 'c': {'max_versions': 1} } if use_snappy: @@ -449,8 +458,8 @@ def __init__(self, manager): def strategy_worker(cls, manager): o = cls(manager) settings = manager.settings - o._states = HBaseState(o.connection, settings.get('HBASE_METADATA_TABLE'), - settings.get('HBASE_STATE_CACHE_SIZE_LIMIT')) + o._states = HBaseState(o.connection, settings.get('HBASE_STATES_TABLE'), + settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'), settings.get('HBASE_DROP_ALL_TABLES')) return o @classmethod diff --git a/frontera/contrib/messagebus/kafka/async.py b/frontera/contrib/messagebus/kafka/async.py index e8965c91a..cbec753c9 100644 --- a/frontera/contrib/messagebus/kafka/async.py +++ b/frontera/contrib/messagebus/kafka/async.py @@ -221,6 +221,7 @@ def _send_offset_request(self, partitions, timestamp): _f.add_callback(self._handle_offset_response, partitions, future_request) _f.add_errback(lambda e: future_request.failure(e)) futures.append(future_request) + return futures def _handle_offset_response(self, partitions, future, response): diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index b75db08e5..fd783020a 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -16,6 +16,7 @@ HBASE_NAMESPACE = 'crawler' HBASE_DROP_ALL_TABLES = False HBASE_METADATA_TABLE = 'metadata' +HBASE_STATES_TABLE = 'states' HBASE_USE_SNAPPY = False HBASE_USE_FRAMED_COMPACT = False HBASE_BATCH_SIZE = 9216 diff --git a/tests/contrib/backends/hbase/test_hbase.py b/tests/contrib/backends/hbase/test_hbase.py index 260dbf6c1..c79881a30 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -71,7 +71,7 @@ def test_queue_with_delay(self): def test_state(self): connection = Connection(host='hbase-docker', port=9090) - state = HBaseState(connection, b'metadata', 300000) + state = HBaseState(connection, b'states', 300000, True) state.set_states([r1, r2, r3]) assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 state.update_cache([r1, r2, r3]) @@ -100,13 +100,16 @@ def 
test_drop_all_tables_when_table_name_is_str(self): connection.delete_table(table, True) hbase_queue_table = 'queue' hbase_metadata_table = 'metadata' + hbase_states_table = 'states' connection.create_table(hbase_queue_table, {'f': {'max_versions': 1}}) connection.create_table(hbase_metadata_table, {'f': {'max_versions': 1}}) + connection.create_table(hbase_states_table, {'f': {'max_versions': 1}}) tables = connection.tables() - assert set(tables) == set([b'metadata', b'queue']) # Failure of test itself + assert set(tables) == set([b'metadata', b'queue', b'states']) # Failure of test itself try: HBaseQueue(connection=connection, partitions=1, table_name=hbase_queue_table, drop=True) HBaseMetadata(connection=connection, table_name=hbase_metadata_table, drop_all_tables=True, use_snappy=False, batch_size=300000, store_content=True) + HBaseState(connection, hbase_states_table, 100, True) except AlreadyExists: assert False, "failed to drop hbase tables" From 9d90f6bfde043f59fe410c76ed009c01139f34f1 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 24 Jan 2018 12:11:41 +0100 Subject: [PATCH 080/273] reducing logging verbosity --- tests/test_message_bus.py | 2 +- tox.ini | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_message_bus.py b/tests/test_message_bus.py index b283a7405..9575b5d16 100644 --- a/tests/test_message_bus.py +++ b/tests/test_message_bus.py @@ -121,7 +121,7 @@ def setUp(self): logging.basicConfig() handler = logging.StreamHandler(stdout) logger = logging.getLogger("kafka") - logger.setLevel(logging.DEBUG) + logger.setLevel(logging.INFO) logger.addHandler(handler) kafka_location = "127.0.0.1:9092" diff --git a/tox.ini b/tox.ini index 9e47f6eac..705b27b61 100644 --- a/tox.ini +++ b/tox.ini @@ -17,7 +17,7 @@ deps = -r{toxinidir}/requirements.txt -r{toxinidir}/requirements/tests.txt commands = - py.test --cov-report=term --cov=frontera -s -v {posargs:tests} + py.test --cov-report=term --cov=frontera -v {posargs:tests} [testenv:flake8] changedir = {toxinidir} @@ -32,6 +32,7 @@ exclude = frontera/_version.py,versioneer.py,docs/source/conf.py,frontera/contri # Options for pytest [pytest] -addopts = -rsvXf +addopts = -rvXf testpaths = tests ignore=requirements +log_cli_level=INFO From 285e03b56888a05ad3f8791fc98d02cc53462964 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 27 Jun 2017 19:09:17 +0300 Subject: [PATCH 081/273] Send crawl stats to Kafka message bus Add Scrapy extension to export stats to Kafka Use mixin class to create new db/sw classes Extend messagebus class with stats logic Extend codec logic with encoding/decoding stats Use export-to-tsdb logic from bigbot-common package Use single logic for forming SSL params --- .../contrib/backends/remote/codecs/msgpack.py | 7 +- frontera/contrib/messagebus/kafkabus.py | 61 +++++------ frontera/contrib/messagebus/stats.py | 100 ++++++++++++++++++ .../contrib/messagebus/zeromq/__init__.py | 4 + frontera/contrib/scrapy/messagebus_stats.py | 79 ++++++++++++++ frontera/core/codec.py | 10 ++ frontera/core/messagebus.py | 16 +++ frontera/settings/default_settings.py | 5 +- frontera/utils/misc.py | 14 ++- frontera/worker/db.py | 27 ++++- frontera/worker/strategy.py | 16 ++- tests/test_codecs.py | 8 +- 12 files changed, 309 insertions(+), 38 deletions(-) create mode 100644 frontera/contrib/messagebus/stats.py create mode 100644 frontera/contrib/scrapy/messagebus_stats.py diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py 
index 5a155ec7b..fd245a518 100644 --- a/frontera/contrib/backends/remote/codecs/msgpack.py +++ b/frontera/contrib/backends/remote/codecs/msgpack.py @@ -68,6 +68,9 @@ def encode_new_job_id(self, job_id): def encode_offset(self, partition_id, offset): return packb([b'of', int(partition_id), int(offset)], use_bin_type=True) + def encode_stats(self, stats): + return packb([b'st', stats], use_bin_type=True) + class Decoder(BaseDecoder): def __init__(self, request_model, response_model, *a, **kw): @@ -109,7 +112,9 @@ def decode(self, buffer): return ('new_job_id', int(obj[1])) if obj[0] == b'of': return ('offset', int(obj[1]), int(obj[2])) - raise TypeError('Unknown message type') + if obj[0] == b'st': + return ('stats', obj[1]) + return TypeError('Unknown message type') def decode_request(self, buffer): return self._request_from_object(unpackb(buffer, encoding='utf-8')) diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py index c7b18f667..d71b6db0f 100644 --- a/frontera/contrib/messagebus/kafkabus.py +++ b/frontera/contrib/messagebus/kafkabus.py @@ -19,6 +19,16 @@ logger = getLogger("messagebus.kafka") +def _prepare_kafka_ssl_kwargs(cert_path): + """Prepare SSL kwargs for Kafka producer/consumer.""" + return { + 'security_protocol': 'SSL', + 'ssl_cafile': os_path_join(cert_path, 'ca-cert.pem'), + 'ssl_certfile': os_path_join(cert_path, 'client-cert.pem'), + 'ssl_keyfile': os_path_join(cert_path, 'client-key.pem') + } + + class Consumer(BaseStreamConsumer): """ Used in DB and SW worker. SW consumes per partition. @@ -27,14 +37,7 @@ def __init__(self, location, enable_ssl, cert_path, topic, group, partition_id): self._location = location self._group = group self._topic = topic - kwargs = {} - if enable_ssl: - kwargs.update({ - 'security_protocol': 'SSL', - 'ssl_cafile': os_path_join(cert_path, 'ca-cert.pem'), - 'ssl_certfile': os_path_join(cert_path, 'client-cert.pem'), - 'ssl_keyfile': os_path_join(cert_path, 'client-key.pem') - }) + kwargs = _prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {} self._consumer = KafkaConsumer( bootstrap_servers=self._location, group_id=self._group, @@ -109,14 +112,7 @@ def __init__(self, location, enable_ssl, cert_path, topic, compression): self._create(enable_ssl, cert_path) def _create(self, enable_ssl, cert_path): - kwargs = {} - if enable_ssl: - kwargs.update({ - 'security_protocol': 'SSL', - 'ssl_cafile': os_path_join(cert_path, 'ca-cert.pem'), - 'ssl_certfile': os_path_join(cert_path, 'client-cert.pem'), - 'ssl_keyfile': os_path_join(cert_path, 'client-key.pem') - }) + kwargs = _prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {} self._producer = KafkaProducer(bootstrap_servers=self._location, retries=5, compression_type=self._compression, @@ -139,14 +135,7 @@ def __init__(self, location, enable_ssl, cert_path, topic_done, partitioner, com self._topic_done = topic_done self._partitioner = partitioner self._compression = compression - kwargs = {} - if enable_ssl: - kwargs.update({ - 'security_protocol': 'SSL', - 'ssl_cafile': os_path_join(cert_path, 'ca-cert.pem'), - 'ssl_certfile': os_path_join(cert_path, 'client-cert.pem'), - 'ssl_keyfile': os_path_join(cert_path, 'client-key.pem') - }) + kwargs = _prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {} self._producer = KafkaProducer(bootstrap_servers=self._location, partitioner=partitioner, retries=5, @@ -207,12 +196,7 @@ def __init__(self, messagebus): 'group_id': self._general_group, } if self._enable_ssl: - kwargs.update({ - 'security_protocol': 
'SSL',
-                'ssl_cafile': os_path_join(self._cert_path, 'ca-cert.pem'),
-                'ssl_certfile': os_path_join(self._cert_path, 'client-cert.pem'),
-                'ssl_keyfile': os_path_join(self._cert_path, 'client-key.pem')
-            })
+            kwargs.update(_prepare_kafka_ssl_kwargs(self._cert_path))
         self._offset_fetcher = OffsetsFetcherAsync(**kwargs)
         self._codec = messagebus.codec
         self._partitions = messagebus.spider_feed_partitions
@@ -252,15 +236,29 @@ def producer(self):
         return SimpleProducer(self._location, self._enable_ssl, self._cert_path, self._topic, self._codec)
 
 
+class StatsLogStream(ScoringLogStream):
+    """Stats log stream implementation for Kafka message bus.
+
+    The interface is the same as for the scoring log stream, so it's better
+    to reuse it with a proper topic and group.
+    """
+    def __init__(self, messagebus):
+        super(StatsLogStream, self).__init__(messagebus)
+        self._topic = messagebus.topic_stats
+        self._group = messagebus.statslog_reader_group
+
+
 class MessageBus(BaseMessageBus):
     def __init__(self, settings):
         self.topic_todo = settings.get('SPIDER_FEED_TOPIC')
         self.topic_done = settings.get('SPIDER_LOG_TOPIC')
         self.topic_scoring = settings.get('SCORING_LOG_TOPIC')
+        self.topic_stats = settings.get('STATS_LOG_TOPIC')
         self.spiderlog_dbw_group = settings.get('SPIDER_LOG_DBW_GROUP')
         self.spiderlog_sw_group = settings.get('SPIDER_LOG_SW_GROUP')
         self.scoringlog_dbw_group = settings.get('SCORING_LOG_DBW_GROUP')
+        self.statslog_reader_group = settings.get('STATS_LOG_READER_GROUP')
         self.spider_feed_group = settings.get('SPIDER_FEED_GROUP')
         self.spider_partition_id = settings.get('SPIDER_PARTITION_ID')
         self.max_next_requests = settings.MAX_NEXT_REQUESTS
@@ -280,3 +278,6 @@ def spider_feed(self):
 
     def scoring_log(self):
         return ScoringLogStream(self)
+
+    def stats_log(self):
+        return StatsLogStream(self)
\ No newline at end of file
diff --git a/frontera/contrib/messagebus/stats.py b/frontera/contrib/messagebus/stats.py
new file mode 100644
index 000000000..bad7f07c1
--- /dev/null
+++ b/frontera/contrib/messagebus/stats.py
@@ -0,0 +1,100 @@
+from logging import getLogger
+from traceback import format_tb
+
+from twisted.internet.task import LoopingCall
+
+from frontera.utils.misc import load_object, utc_timestamp
+
+
+logger = getLogger("messagebus.stats")
+
+
+class StatsExportMixin(object):
+    """Extending Frontera worker's logic by sending stats to the message bus.
+
+    This is a lightweight mixin class to be used with base worker classes,
+    sending stats to the message bus if configured. The mixin also allows
+    you to define custom get_stats_tags() logic in your child
+    classes to store a dictionary with tags as a part of your metrics.
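+
+    A minimal usage sketch (illustrative only, not part of the original
+    patch; ``SomeBaseWorker`` and the tag values are placeholders):
+
+        class MyWorker(StatsExportMixin, SomeBaseWorker):
+            def get_stats_tags(self, settings, *args, **kwargs):
+                return {'source': 'dbw', 'partition_id': 0}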
+    """
+    STATS_PREFIXES = ['consumed', 'pushed']
+
+    def __init__(self, settings, *args, **kwargs):
+        super(StatsExportMixin, self).__init__(settings, *args, **kwargs)
+        message_bus = load_object(settings.get('MESSAGE_BUS'))(settings)
+        stats_log = message_bus.stats_log()
+        # FIXME can be removed after implementing stats_log for ZeroMQ bus
+        if not stats_log:
+            return
+        self.stats_producer = stats_log.producer()
+        self._stats_tags = self.get_stats_tags(settings, *args, **kwargs)
+        self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60)
+        self._export_stats_task = LoopingCall(self.export_stats)
+
+    def run(self):
+
+        def errback_export_stats(failure):
+            logger.exception(failure.value)
+            if failure.frames:
+                logger.critical(str("").join(format_tb(failure.getTracebackObject())))
+            self._export_stats_task.start(interval=self._stats_interval)\
+                .addErrback(errback_export_stats)
+
+        if self.stats_producer:
+            self._export_stats_task.start(interval=self._stats_interval)\
+                .addErrback(errback_export_stats)
+        super(StatsExportMixin, self).run()
+
+    def get_stats_tags(self, *args, **kwargs):
+        """Get a tags dictionary for the metrics.
+
+        Default implementation expects that this method will provide:
+        - 'source' - source type of the metric, one of ['sw', 'dbw', 'spider']
+        - 'partition_id' (optionally) - specific partition id
+        """
+        raise NotImplementedError("Please define the method in a child class")
+
+    @property
+    def _stats_key_prefix(self):
+        """Build key prefix based on the given tags.
+
+        Default implementation of the method relies on get_stats_tags() logic,
+        and existence of 'source'/'partition_id' tags.
+        """
+        prefix = self._stats_tags.get('source')
+        if 'partition_id' in self._stats_tags:
+            prefix += '-{}'.format(self._stats_tags.get('partition_id'))
+        return prefix
+
+    def export_stats(self):
+        """Export crawl stats to message bus.
+
+        Message is formed in the following way:
+        - key: a prefix from _stats_key_prefix() + stats timestamp
+        - value: a stats dictionary packed with self._encoder
+        """
+        stats = self.get_stats()
+        if not stats:
+            return
+        stats_key = '{}-{}'.format(self._stats_key_prefix, stats['_timestamp'])
+        # self._encoder is defined as a part of worker initialization
+        encoded_msg = self._encoder.encode_stats(stats)
+        self.stats_producer.send(stats_key, encoded_msg)
+        logger.info("Sent stats for {} to message bus: {}".format(stats_key, stats))
+
+    def get_stats(self):
+        """Return default stats with a timestamp.
+
+        It's useful to have a default implementation of the method because both
+        strategy and db worker store stats this way, though this logic could be
+        modified in a child class to redefine/transform stats data.
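+
+        An illustrative return value under these rules (the values are made
+        up; keys are filtered by STATS_PREFIXES, the timestamp comes from
+        utc_timestamp()):
+
+            {'consumed_since_start': 10,
+             'pushed_since_start': 2,
+             '_timestamp': 1499241748,
+             '_tags': {'source': 'sw', 'partition_id': 0}}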
+ """ + # report only stats with given prefixes, no need to push all of them + stats = {stats_key: self.stats[stats_key] + for stats_key in self.stats + if stats_key.split('_', 1)[0] in self.STATS_PREFIXES} + if not stats: + return + stats['_timestamp'] = utc_timestamp() + stats['_tags'] = self._stats_tags + return stats diff --git a/frontera/contrib/messagebus/zeromq/__init__.py b/frontera/contrib/messagebus/zeromq/__init__.py index 3f99fc973..ca6b5e951 100644 --- a/frontera/contrib/messagebus/zeromq/__init__.py +++ b/frontera/contrib/messagebus/zeromq/__init__.py @@ -221,3 +221,7 @@ def scoring_log(self): def spider_feed(self): return SpiderFeedStream(self) + + def stats_log(self): + # FIXME implement it for completeness + self.logger.warning("Exporting stats is not implemented for ZeroMQ yet") diff --git a/frontera/contrib/scrapy/messagebus_stats.py b/frontera/contrib/scrapy/messagebus_stats.py new file mode 100644 index 000000000..c39f09e79 --- /dev/null +++ b/frontera/contrib/scrapy/messagebus_stats.py @@ -0,0 +1,79 @@ +import logging +from traceback import format_tb + +from scrapy import signals +from scrapy.exceptions import NotConfigured +from twisted.internet.task import LoopingCall + +from frontera.contrib.scrapy.settings_adapter import ScrapySettingsAdapter +from frontera.utils.misc import utc_timestamp, load_object + + +logger = logging.getLogger(__name__) + +# scrapy stats ignored by the exporter by default +STATS_DEFAULT_BLACKLIST = [ + "start_time", +] + + +class StatsExporterToMessageBus(object): + """Export crawl stats to message bus.""" + + def __init__(self, crawler): + settings = ScrapySettingsAdapter(crawler.settings) + self.partition_id = settings.get('SPIDER_PARTITION_ID') + # XXX this can be improved later by reusing spider's producer + # (crawler->engine->slot->scheduler->frontier->manager-> backend->_producer) + # but the topic is hard-coded in the current scheme, so it requires some + # preliminary changes in Frontera itself. 
+        message_bus = load_object(settings.get('MESSAGE_BUS'))(settings)
+        stats_log = message_bus.stats_log()
+        if not stats_log:
+            raise NotConfigured
+        self.stats_producer = stats_log.producer()
+        self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60)
+        codec_path = settings.get('MESSAGE_BUS_CODEC')
+        encoder_cls = load_object(codec_path + ".Encoder")
+        self._stats_encoder = encoder_cls(request_model=None)  # no need to encode requests
+        self._export_stats_task = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        obj = cls(crawler)
+        crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
+        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
+        return obj
+
+    def spider_opened(self, spider):
+
+        def errback_export_stats(failure):
+            logger.exception(failure.value)
+            if failure.frames:
+                logger.critical(str("").join(format_tb(failure.getTracebackObject())))
+            self._export_stats_task.start(self._stats_interval)\
+                .addErrback(errback_export_stats)
+
+        self._export_stats_task = LoopingCall(self.export_stats, spider)
+        self._export_stats_task.start(self._stats_interval)\
+            .addErrback(errback_export_stats)
+
+    def spider_closed(self, spider):
+        if self._export_stats_task:
+            self._export_stats_task.stop()
+            self._export_stats_task = None
+            self.stats_producer.flush()
+            self.stats_producer.close()
+
+    def export_stats(self, spider):
+        all_stats = spider.crawler.stats.get_stats()
+        stats = {key: all_stats[key] for key in all_stats
+                 if key not in STATS_DEFAULT_BLACKLIST}
+        if not stats:
+            return  # no need to send empty stats
+        stats['_timestamp'] = utc_timestamp()
+        stats['_tags'] = {'source': 'spider', 'partition_id': self.partition_id}
+        key = 'spider-{}-{}'.format(self.partition_id, stats['_timestamp'])
+        encoded_msg = self._stats_encoder.encode_stats(stats)
+        self.stats_producer.send(key, encoded_msg)
+        logger.debug("Sent spider stats to message bus: %s", stats)
diff --git a/frontera/core/codec.py b/frontera/core/codec.py
index 45f6e0068..5e618b611 100644
--- a/frontera/core/codec.py
+++ b/frontera/core/codec.py
@@ -118,3 +118,13 @@ def encode_offset(self, partition_id, offset):
         :return: bytes encoded message
         """
         pass
+
+    @abstractmethod
+    def encode_stats(self, stats):
+        """
+        Encodes current crawl stats.
+
+        :param stats: a dictionary with stats
+        :return: bytes encoded message
+        """
+        pass
\ No newline at end of file
diff --git a/frontera/core/messagebus.py b/frontera/core/messagebus.py
index 3782f6c00..6b8a6db95 100644
--- a/frontera/core/messagebus.py
+++ b/frontera/core/messagebus.py
@@ -124,6 +124,14 @@ def producer(self):
         raise NotImplementedError
 
 
+@six.add_metaclass(ABCMeta)
+class BaseStatsLogStream(BaseScoringLogStream):
+    """
+    Stats log stream base class. This stream is transferring stats metrics from workers and spiders to external
+    data sources. This type of stream doesn't require any partitioning.
+    """
+
+
 @six.add_metaclass(ABCMeta)
 class BaseSpiderFeedStream(object):
     """
@@ -204,3 +212,11 @@ def spider_feed(self):
         :return: instance of SpiderFeedStream
         """
         raise NotImplementedError
+
+    @abstractmethod
+    def stats_log(self):
+        """
+        Create or return stats log stream.
+ :return: instance of StatsLogStream + """ + raise NotImplementedError \ No newline at end of file diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index fd783020a..4de309fb9 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -47,6 +47,7 @@ SPIDER_LOG_PARTITIONS = 1 SPIDER_FEED_PARTITIONS = 1 SPIDER_PARTITION_ID = 0 +STATS_LOG_PARTITIONS = 1 SQLALCHEMYBACKEND_CACHE_SIZE = 10000 SQLALCHEMYBACKEND_CLEAR_CONTENT = True SQLALCHEMYBACKEND_DROP_ALL_TABLES = True @@ -78,12 +79,14 @@ SPIDER_FEED_TOPIC = "frontier-todo" SPIDER_LOG_TOPIC = "frontier-done" SCORING_LOG_TOPIC = "frontier-score" +STATS_LOG_TOPIC = 'frontier-stats' SPIDER_LOG_DBW_GROUP = "dbw-spider-log" SPIDER_LOG_SW_GROUP = "sw-spider-log" SCORING_LOG_DBW_GROUP = "dbw-scoring-log" SPIDER_FEED_GROUP = "fetchers-spider-feed" +STATS_LOG_READER_GROUP = 'stats-reader-log' KAFKA_CODEC = None KAFKA_CERT_PATH = '/mnt/mesos/sandbox' -KAFKA_ENABLE_SSL = False \ No newline at end of file +KAFKA_ENABLE_SSL = False diff --git a/frontera/utils/misc.py b/frontera/utils/misc.py index 15731195f..823bd0371 100644 --- a/frontera/utils/misc.py +++ b/frontera/utils/misc.py @@ -1,9 +1,17 @@ from __future__ import absolute_import -from importlib import import_module + +import time +import calendar from zlib import crc32 +from importlib import import_module + +import six from six.moves import range from w3lib.util import to_bytes -import six + + +def utc_timestamp(): + return calendar.timegm(time.gmtime()) def load_object(path): @@ -72,4 +80,4 @@ def dict_to_unicode(obj): if isinstance(obj, list): return map(dict_to_unicode, obj) else: - return obj \ No newline at end of file + return obj diff --git a/frontera/worker/db.py b/frontera/worker/db.py index 4ddf8f2b5..07468ac7e 100644 --- a/frontera/worker/db.py +++ b/frontera/worker/db.py @@ -19,6 +19,8 @@ from frontera.utils.misc import load_object from frontera.utils.async import CallLaterOnce from frontera.utils.ossignal import install_shutdown_handlers + +from frontera.contrib.messagebus.stats import StatsExportMixin from .server import WorkerJsonRpcService import six from six.moves import map @@ -69,7 +71,9 @@ def cancel(self): self.consumption.cancel() -class DBWorker(object): +class BaseDBWorker(object): + """Base database worker class.""" + def __init__(self, settings, no_batches, no_incoming, no_scoring): messagebus = load_object(settings.get('MESSAGE_BUS')) self.mb = messagebus(settings) @@ -325,6 +329,27 @@ def get_fingerprint(request): return count +class DBWorker(StatsExportMixin, BaseDBWorker): + """Main database worker class with useful extensions. 
+ + The additional features are provided by using mixin classes: + - sending crawl stats to message bus + """ + def get_stats_tags(self, settings, no_batches, no_incoming, no_scoring): + if no_batches and no_scoring: + db_worker_type = 'linksdb' + elif no_batches and no_incoming: + db_worker_type = 'scoring' + elif no_incoming and no_scoring: + db_worker_type = 'batchgen' + else: + logger.warning("Can't identify DB worker type " + "(no-scoring {}, no-batches {}, no-incoming {})" + .format(no_scoring, no_batches, no_incoming)) + db_worker_type = 'none' + return {'source': 'dbw-{}'.format(db_worker_type)} + + if __name__ == '__main__': parser = ArgumentParser(description="Frontera DB worker.") parser.add_argument('--no-batches', action='store_true', diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 93768f388..105be8d05 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -13,6 +13,8 @@ from frontera.core.manager import FrontierManager from frontera.logger.handlers import CONSOLE +from frontera.contrib.messagebus.stats import StatsExportMixin + from twisted.internet.task import LoopingCall from twisted.internet import reactor from twisted.internet.defer import Deferred @@ -82,7 +84,9 @@ def flush(self): logger.info("Flushing of states finished") -class StrategyWorker(object): +class BaseStrategyWorker(object): + """Base strategy worker class.""" + def __init__(self, settings, strategy_class): partition_id = settings.get('SCORING_PARTITION_ID') if partition_id is None or type(partition_id) != int: @@ -326,6 +330,16 @@ def on_request_error(self, request, error): self.states.update_cache(request) +class StrategyWorker(StatsExportMixin, BaseStrategyWorker): + """Main strategy worker class with useful extensions. 
+ + The additional features are provided by using mixin classes: + - sending crawl stats to message bus + """ + def get_stats_tags(self, settings, *args, **kwargs): + return {'source': 'sw', 'partition_id': settings.get('SCORING_PARTITION_ID')} + + def setup_environment(): parser = ArgumentParser(description="Frontera strategy worker.") parser.add_argument('--config', type=str, required=True, diff --git a/tests/test_codecs.py b/tests/test_codecs.py index e4d59348c..6cf9ee588 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -56,6 +56,8 @@ def check_request(req1, req2): req = Request(url="http://www.yandex.ru", method=b'GET', meta={b'test': b'shmest', b'scrapy_meta': {'rule': 0, 'key': 'value'}}, headers={b'reqhdr': b'value'}) req2 = Request(url="http://www.yandex.ru/search") + stats = {'_timestamp': 1499241748, 'tags': {'source': 'spider', 'partition_id': 0}, + 'crawled_pages_count': 2, 'links_extracted_count': 3} msgs = [ enc.encode_add_seeds([req]), enc.encode_page_crawled(Response(url="http://www.yandex.ru", body=b'SOME CONTENT', headers={b'hdr': b'value'}, @@ -66,6 +68,7 @@ def check_request(req1, req2): enc.encode_new_job_id(1), enc.encode_offset(0, 28796), enc.encode_request(req), + enc.encode_stats(stats), invalid_value, ] @@ -120,10 +123,13 @@ def check_request(req1, req2): o = dec.decode_request(next(it)) check_request(o, req) + o_type, stats = dec.decode(next(id)) + assert o_type == 'stats' + assert stats == stats + with pytest.raises(TypeError): dec.decode(next(it)) - class TestEncodeDecodeJson(unittest.TestCase): """ Test for testing methods `_encode_recursively` and `_decode_recursively` used in json codec From 3c16f22f07b4d1dffe98b66d80e11b327ceb7fbe Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Mon, 10 Jul 2017 15:36:44 +0300 Subject: [PATCH 082/273] Add Mesos task id as a tag for DB worker metrics --- frontera/worker/db.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/frontera/worker/db.py b/frontera/worker/db.py index 07468ac7e..fc87fb8cb 100644 --- a/frontera/worker/db.py +++ b/frontera/worker/db.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import + +import os import logging from traceback import format_stack from signal import signal, SIGUSR1 @@ -347,7 +349,12 @@ def get_stats_tags(self, settings, no_batches, no_incoming, no_scoring): "(no-scoring {}, no-batches {}, no-incoming {})" .format(no_scoring, no_batches, no_incoming)) db_worker_type = 'none' - return {'source': 'dbw-{}'.format(db_worker_type)} + tags = {'source': 'dbw-{}'.format(db_worker_type)} + # add mesos task id as a tag if running via marathon + mesos_task_id = os.environ.get('MESOS_TASK_ID') + if mesos_task_id: + tags['mesos_task_id'] = mesos_task_id + return tags if __name__ == '__main__': From 5aaff5fd4a845a6ebda6ea13fe847ceb85dbd384 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Mon, 10 Jul 2017 19:21:05 +0300 Subject: [PATCH 083/273] Send backend stats to message bus Move happybase to dependencies folder Add get_stats() to distributed backend interface Provide useful context manager to measure elapsed time Use the manager to estimate get_stats() efficiency --- frontera/contrib/backends/hbase.py | 10 +++++++++- frontera/contrib/messagebus/stats.py | 1 + frontera/core/components.py | 11 +++++++++++ frontera/utils/misc.py | 20 ++++++++++++++++++++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 64df9a055..942341d33 100644 --- 
a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -5,7 +5,7 @@ from frontera.core.components import Metadata, Queue, States from frontera.core.models import Request from frontera.contrib.backends.partitioners import Crc32NamePartitioner -from frontera.utils.misc import chunks, get_crc32 +from frontera.utils.misc import chunks, get_crc32, time_elapsed from frontera.contrib.backends.remote.codecs.msgpack import Decoder, Encoder from happybase import Connection @@ -527,3 +527,11 @@ def get_next_requests(self, max_next_requests, **kwargs): next_pages.extend(results) self.logger.debug("Got %d requests for partition id %d", len(results), partition_id) return next_pages + + def get_stats(self): + """Helper to get stats dictionary for the backend. + + For now it provides only HBase client stats. + """ + with time_elapsed('Call HBase backend get_stats()'): + return self.connection.client.get_stats() diff --git a/frontera/contrib/messagebus/stats.py b/frontera/contrib/messagebus/stats.py index bad7f07c1..623c1d3c7 100644 --- a/frontera/contrib/messagebus/stats.py +++ b/frontera/contrib/messagebus/stats.py @@ -93,6 +93,7 @@ def get_stats(self): stats = {stats_key: self.stats[stats_key] for stats_key in self.stats if stats_key.split('_', 1)[0] in self.STATS_PREFIXES} + stats.update(self._manager.backend.get_stats() or {}) if not stats: return stats['_timestamp'] = utc_timestamp() diff --git a/frontera/core/components.py b/frontera/core/components.py index 33529c7bc..7017e2102 100644 --- a/frontera/core/components.py +++ b/frontera/core/components.py @@ -257,6 +257,17 @@ def strategy_worker(cls, manager): def db_worker(cls, manager): raise NotImplementedError + def get_stats(self): + """ + Returns a dictionary with distributed backend stats. + + Depending on a backend type the method may return different stats to be sent to a message bus. + Called by :class:`StatsExportMixin ` for workers. + + :return: dict of stats key/values. 
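+
+        An illustrative override for a concrete backend (a sketch only; the
+        key name is hypothetical, ``self.queue.count()`` mirrors the queue
+        component API used elsewhere in this file):
+
+            def get_stats(self):
+                return {'queue_size': self.queue.count()}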
+ """ + return None + class Partitioner(object): """ diff --git a/frontera/utils/misc.py b/frontera/utils/misc.py index 823bd0371..7ca830a21 100644 --- a/frontera/utils/misc.py +++ b/frontera/utils/misc.py @@ -1,8 +1,10 @@ from __future__ import absolute_import import time +import logging import calendar from zlib import crc32 +from timeit import default_timer from importlib import import_module import six @@ -10,6 +12,9 @@ from w3lib.util import to_bytes +logger = logging.getLogger("utils.misc") + + def utc_timestamp(): return calendar.timegm(time.gmtime()) @@ -81,3 +86,18 @@ def dict_to_unicode(obj): return map(dict_to_unicode, obj) else: return obj + + +class time_elapsed(object): + """Useful context manager to measure elapsed time.""" + + def __init__(self, name): + self.name = name + + def __enter__(self): + self.start = default_timer() + + def __exit__(self, ty, val, tb): + end = default_timer() + logger.debug("%s : %0.3f seconds" % (self.name, end-self.start)) + return False From c296a864302adb6d7e83e73fd321b098d402b8b2 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 26 Jul 2017 16:18:41 +0300 Subject: [PATCH 084/273] No need to setup stats log partitions --- frontera/settings/default_settings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 4de309fb9..748429104 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -47,7 +47,6 @@ SPIDER_LOG_PARTITIONS = 1 SPIDER_FEED_PARTITIONS = 1 SPIDER_PARTITION_ID = 0 -STATS_LOG_PARTITIONS = 1 SQLALCHEMYBACKEND_CACHE_SIZE = 10000 SQLALCHEMYBACKEND_CLEAR_CONTENT = True SQLALCHEMYBACKEND_DROP_ALL_TABLES = True From ea8fb1833ea36e515fd78222430389856ea8d797 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 2 Aug 2017 19:49:50 +0300 Subject: [PATCH 085/273] Decrease log-level for stats producer --- frontera/contrib/messagebus/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/contrib/messagebus/stats.py b/frontera/contrib/messagebus/stats.py index 623c1d3c7..0e086a034 100644 --- a/frontera/contrib/messagebus/stats.py +++ b/frontera/contrib/messagebus/stats.py @@ -80,7 +80,7 @@ def export_stats(self): # self._encoder is defined as a part of worker initialization encoded_msg = self._encoder.encode_stats(stats) self.stats_producer.send(stats_key, encoded_msg) - logger.info("Sent stats for {} to message bus: {}".format(stats_key, stats)) + logger.debug("Sent stats for {} to message bus: {}".format(stats_key, stats)) def get_stats(self): """Return default stats with a timestamp. 
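The `time_elapsed` helper added in PATCH 083 above is an ordinary context
manager, so it can wrap any block whose duration is worth logging. A minimal
usage sketch (the wrapped call mirrors the HBase backend change from the same
patch; any slow call would do):

    from frontera.utils.misc import time_elapsed

    with time_elapsed('Call HBase backend get_stats()'):
        stats = connection.client.get_stats()

On exit it logs the name and the elapsed seconds at DEBUG level through the
"utils.misc" logger, and returns False so exceptions still propagate.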
From 7589208d4250b4d63cda7e1634db765d5d181b54 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 12 Jul 2017 18:53:30 +0300 Subject: [PATCH 086/273] lru cache for HBaseState --- docs/source/topics/frontera-settings.rst | 18 +++++-- frontera/contrib/backends/hbase.py | 53 +++++++++---------- frontera/contrib/backends/memory/__init__.py | 4 +- .../contrib/backends/sqlalchemy/components.py | 4 +- frontera/core/components.py | 4 +- frontera/settings/default_settings.py | 3 +- frontera/worker/strategy.py | 18 ++++--- tests/contrib/backends/hbase/test_hbase.py | 40 +++++++++----- 8 files changed, 83 insertions(+), 61 deletions(-) diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index ee5ce2c89..d6afe815b 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -378,10 +378,10 @@ Determines if content should be sent over the message bus and stored in the back SW_FLUSH_INTERVAL ----------------- -Default: ``900`` +Default: ``300`` -Mean interval between flushing of states in :term:`strategy worker`. Selected randomly using formula -SW_FLUSH_INTERVAL + RANDINT(-SW_FLUSH_INTERVAL/2, SW_FLUSH_INTERVAL/2) +Interval between flushing of states in :term:`strategy worker`. Also used to set initial random delay to flush states +periodically, using formula ``RANDINT(SW_FLUSH_INTERVAL)``. .. setting:: TEST_MODE @@ -571,6 +571,15 @@ Default: ``queue`` Name of HBase priority queue table. +.. settings:: HBASE_STATE_WRITE_LOG_SIZE + +HBASE_STATE_WRITE_LOG_SIZE +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Default: ``15000`` + +Number of state changes in the :term:`state cache` of :term:`strategy worker`, before it get's flushed to HBase and cleared. + .. setting:: HBASE_STATE_CACHE_SIZE_LIMIT HBASE_STATE_CACHE_SIZE_LIMIT @@ -578,7 +587,8 @@ HBASE_STATE_CACHE_SIZE_LIMIT Default: ``3000000`` -Number of items in the :term:`state cache` of :term:`strategy worker`, before it get's flushed to HBase and cleared. +Number of cached state changes in the :term:`state cache` of :term:`strategy worker`. Internally there is ``cachetools.LRUCache`` +storing all the recent state changes, discarding least recently used when the cache gets over its capacity. .. 
setting:: HBASE_STATES_TABLE diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 64df9a055..e1967a8fc 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -13,6 +13,7 @@ import six from six.moves import range from w3lib.util import to_bytes +from cachetools import LRUCache from struct import pack, unpack from datetime import datetime @@ -276,12 +277,14 @@ def count(self): class HBaseState(States): - def __init__(self, connection, table_name, cache_size_limit, drop_all_tables): + def __init__(self, connection, table_name, cache_size_limit, + write_log_size, drop_all_tables): self.connection = connection self._table_name = to_bytes(table_name) self.logger = logging.getLogger("hbase.states") - self._state_cache = {} - self._cache_size_limit = cache_size_limit + self._state_cache = LRUCache(maxsize=cache_size_limit) + self._state_batch = self.connection.table( + self._table_name).batch(batch_size=write_log_size) tables = set(connection.tables()) if drop_all_tables and self._table_name in tables: @@ -296,36 +299,27 @@ def __init__(self, connection, table_name, cache_size_limit, drop_all_tables): def update_cache(self, objs): objs = objs if isinstance(objs, Iterable) else [objs] - - def put(obj): - self._state_cache[obj.meta[b'fingerprint']] = obj.meta[b'state'] - [put(obj) for obj in objs] + for obj in objs: + fingerprint, state = obj.meta[b'fingerprint'], obj.meta[b'state'] + # prepare & write state change to happybase batch + self._state_batch.put(unhexlify(fingerprint), prepare_hbase_object(state=state)) + # update LRU cache with the state update + self._state_cache[fingerprint] = state def set_states(self, objs): objs = objs if isinstance(objs, Iterable) else [objs] + for obj in objs: + obj.meta[b'state'] = self._state_cache.get(obj.meta[b'fingerprint'], States.DEFAULT) - def get(obj): - fprint = obj.meta[b'fingerprint'] - obj.meta[b'state'] = self._state_cache[fprint] if fprint in self._state_cache else States.DEFAULT - [get(obj) for obj in objs] - - def flush(self, force_clear): - if len(self._state_cache) > self._cache_size_limit: - force_clear = True - table = self.connection.table(self._table_name) - for chunk in chunks(list(self._state_cache.items()), 32768): - with table.batch(transaction=True) as b: - for fprint, state in chunk: - hb_obj = prepare_hbase_object(state=state) - b.put(unhexlify(fprint), hb_obj) - if force_clear: - self.logger.debug("Cache has %d requests, clearing" % len(self._state_cache)) - self._state_cache.clear() + def flush(self): + self._state_batch.send() def fetch(self, fingerprints): to_fetch = [f for f in fingerprints if f not in self._state_cache] - self.logger.debug("cache size %s" % len(self._state_cache)) - self.logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints))) + if not to_fetch: + return + self.logger.debug('Fetching %d/%d elements from HBase (cache size %d)', + len(to_fetch), len(fingerprints), len(self._state_cache)) for chunk in chunks(to_fetch, 65536): keys = [unhexlify(fprint) for fprint in chunk] table = self.connection.table(self._table_name) @@ -458,8 +452,11 @@ def __init__(self, manager): def strategy_worker(cls, manager): o = cls(manager) settings = manager.settings - o._states = HBaseState(o.connection, settings.get('HBASE_STATES_TABLE'), - settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'), settings.get('HBASE_DROP_ALL_TABLES')) + o._states = HBaseState(connection=o.connection, + table_name=settings.get('HBASE_STATES_TABLE'), + 
cache_size_limit=settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'), + write_log_size=settings.get('HBASE_STATE_WRITE_LOG_SIZE'), + drop_all_tables=settings.get('HBASE_DROP_ALL_TABLES')) return o @classmethod diff --git a/frontera/contrib/backends/memory/__init__.py b/frontera/contrib/backends/memory/__init__.py index 22727cf43..2e6deb7ff 100644 --- a/frontera/contrib/backends/memory/__init__.py +++ b/frontera/contrib/backends/memory/__init__.py @@ -146,10 +146,8 @@ def set_states(self, objs): def fetch(self, fingerprints): pass - def flush(self, force_clear=False): + def flush(self): if len(self._cache) > self._cache_size_limit: - force_clear = True - if force_clear: self.logger.debug("Cache has %d items, clearing", len(self._cache)) self._cache.clear() diff --git a/frontera/contrib/backends/sqlalchemy/components.py b/frontera/contrib/backends/sqlalchemy/components.py index 8661ac576..ab99a0fac 100644 --- a/frontera/contrib/backends/sqlalchemy/components.py +++ b/frontera/contrib/backends/sqlalchemy/components.py @@ -135,13 +135,13 @@ def fetch(self, fingerprints): self._cache[to_bytes(state.fingerprint)] = state.state @retry_and_rollback - def flush(self, force_clear=False): + def flush(self): for fingerprint, state_val in six.iteritems(self._cache): state = self.model(fingerprint=to_native_str(fingerprint), state=state_val) self.session.merge(state) self.session.commit() self.logger.debug("State cache has been flushed.") - super(States, self).flush(force_clear) + super(States, self).flush() class Queue(BaseQueue): diff --git a/frontera/core/components.py b/frontera/core/components.py index 33529c7bc..624db43cb 100644 --- a/frontera/core/components.py +++ b/frontera/core/components.py @@ -126,11 +126,9 @@ def set_states(self, objs): raise NotImplementedError @abstractmethod - def flush(self, force_clear): + def flush(self): """ Flushes internal cache to storage. 
- - :param force_clear: boolean, True - signals to clear cache after flush """ raise NotImplementedError diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index fd783020a..9907f1f3a 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -21,6 +21,7 @@ HBASE_USE_FRAMED_COMPACT = False HBASE_BATCH_SIZE = 9216 HBASE_STATE_CACHE_SIZE_LIMIT = 3000000 +HBASE_STATE_WRITE_LOG_SIZE = 15000 HBASE_QUEUE_TABLE = 'queue' KAFKA_GET_TIMEOUT = 5.0 MAX_NEXT_REQUESTS = 64 @@ -61,7 +62,7 @@ STATE_CACHE_SIZE = 1000000 STATE_CACHE_SIZE_LIMIT = 0 STORE_CONTENT = False -SW_FLUSH_INTERVAL = 900 +SW_FLUSH_INTERVAL = 300 TEST_MODE = False TLDEXTRACT_DOMAIN_INFO = False URL_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 93768f388..bb0e335d7 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -14,7 +14,7 @@ from frontera.core.manager import FrontierManager from frontera.logger.handlers import CONSOLE from twisted.internet.task import LoopingCall -from twisted.internet import reactor +from twisted.internet import reactor, task from twisted.internet.defer import Deferred from frontera.settings import Settings @@ -78,7 +78,7 @@ def release(self): def flush(self): logger.info("Flushing states") - self._states.flush(force_clear=False) + self._states.flush() logger.info("Flushing of states finished") @@ -119,8 +119,7 @@ def __init__(self, settings, strategy_class): self.task = LoopingCall(self.work) self._logging_task = LoopingCall(self.log_status) self._flush_states_task = LoopingCall(self.flush_states) - flush_interval = settings.get("SW_FLUSH_INTERVAL") - self._flush_interval = flush_interval + randint(-flush_interval / 2, flush_interval / 2) + self._flush_interval = settings.get("SW_FLUSH_INTERVAL") logger.info("Strategy worker is initialized and consuming partition %d", partition_id) def collect_unknown_message(self, msg): @@ -236,9 +235,13 @@ def errback_main(failure): log_failure(failure) self.task.start(interval=0).addErrback(errback_main) + def run_flush_states_task(): + (self._flush_states_task.start(interval=self._flush_interval) + .addErrback(errback_flush_states)) + def errback_flush_states(failure): log_failure(failure) - self._flush_states_task.start(interval=300).addErrback(errback_flush_states) + run_flush_states_task() def debug(sig, frame): logger.critical("Signal received: printing stack trace") @@ -247,7 +250,10 @@ def debug(sig, frame): install_shutdown_handlers(self._handle_shutdown) self.task.start(interval=0).addErrback(errback_main) self._logging_task.start(interval=30) - self._flush_states_task.start(interval=self._flush_interval).addErrback(errback_flush_states) + # run flushing states LoopingCall with random delay + flush_states_task_delay = randint(0, self._flush_interval) + logger.info("Starting flush-states task in %d seconds", flush_states_task_delay) + task.deferLater(reactor, flush_states_task_delay, run_flush_states_task) signal(SIGUSR1, debug) reactor.run(installSignalHandlers=False) diff --git a/tests/contrib/backends/hbase/test_hbase.py b/tests/contrib/backends/hbase/test_hbase.py index c79881a30..4b8400d41 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -1,6 +1,13 @@ from __future__ import absolute_import + +from time import sleep, time +from binascii import unhexlify + +from msgpack import unpackb from happybase import Connection 
+from w3lib.util import to_native_str from Hbase_thrift import AlreadyExists # module loaded at runtime in happybase + from frontera.contrib.backends.hbase import HBaseState, HBaseMetadata, HBaseQueue from frontera.core.models import Request, Response from frontera.core.components import States @@ -55,7 +62,7 @@ def test_queue(self): @pytest.mark.xfail def test_queue_with_delay(self): connection = Connection(host='hbase-docker', port=9090) - queue = HBaseQueue(connection, 1, b'queue', True) + queue = HBaseQueue(connection, 1, b'queue', use_snappy=False, drop=True) r5 = r3.copy() crawl_at = int(time()) + 1000 r5.meta[b'crawl_at'] = crawl_at @@ -71,28 +78,31 @@ def test_queue_with_delay(self): def test_state(self): connection = Connection(host='hbase-docker', port=9090) - state = HBaseState(connection, b'states', 300000, True) + state = HBaseState(connection, b'states', cache_size_limit=300000, + write_log_size=5000, drop_all_tables=True) state.set_states([r1, r2, r3]) assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 state.update_cache([r1, r2, r3]) - assert state._state_cache == {b'10': States.NOT_CRAWLED, - b'11': States.NOT_CRAWLED, - b'12': States.NOT_CRAWLED} + assert dict(state._state_cache) == {b'10': States.NOT_CRAWLED, + b'11': States.NOT_CRAWLED, + b'12': States.NOT_CRAWLED} + assert state._state_batch._mutation_count == 3 r1.meta[b'state'] = States.CRAWLED r2.meta[b'state'] = States.CRAWLED r3.meta[b'state'] = States.CRAWLED state.update_cache([r1, r2, r3]) - state.flush(True) - assert state._state_cache == {} + assert state._state_batch._mutation_count == 6 + state.flush() + assert state._state_batch._mutation_count == 0 state.fetch([b'10', b'11', b'12']) - assert state._state_cache == {b'10': States.CRAWLED, - b'11': States.CRAWLED, - b'12': States.CRAWLED} + assert dict(state._state_cache) == {b'10': States.CRAWLED, + b'11': States.CRAWLED, + b'12': States.CRAWLED} r4.meta[b'state'] = States.ERROR state.set_states([r1, r2, r4]) assert r4.meta[b'state'] == States.CRAWLED - state.flush(True) - assert state._state_cache == {} + state.flush() + assert state._state_batch._mutation_count == 0 def test_drop_all_tables_when_table_name_is_str(self): connection = Connection(host='hbase-docker', port=9090) @@ -107,9 +117,11 @@ def test_drop_all_tables_when_table_name_is_str(self): tables = connection.tables() assert set(tables) == set([b'metadata', b'queue', b'states']) # Failure of test itself try: - HBaseQueue(connection=connection, partitions=1, table_name=hbase_queue_table, drop=True) + HBaseQueue(connection=connection, partitions=1, + table_name=hbase_queue_table, use_snappy=False, drop=True) HBaseMetadata(connection=connection, table_name=hbase_metadata_table, drop_all_tables=True, use_snappy=False, batch_size=300000, store_content=True) - HBaseState(connection, hbase_states_table, 100, True) + HBaseState(connection, hbase_states_table, cache_size_limit=100, + write_log_size=10, drop_all_tables=True) except AlreadyExists: assert False, "failed to drop hbase tables" From 457b8ee637946b92a36e4bb3a0f824620cfdeb16 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 30 Jan 2018 10:02:43 +0100 Subject: [PATCH 087/273] codecs tests fixed --- frontera/contrib/backends/remote/codecs/json.py | 8 ++++++++ frontera/contrib/backends/remote/codecs/msgpack.py | 2 +- tests/test_codecs.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/frontera/contrib/backends/remote/codecs/json.py b/frontera/contrib/backends/remote/codecs/json.py index 
135c44d83..a2786d6cc 100644 --- a/frontera/contrib/backends/remote/codecs/json.py +++ b/frontera/contrib/backends/remote/codecs/json.py @@ -143,6 +143,12 @@ def encode_offset(self, partition_id, offset): 'offset': int(offset) }) + def encode_stats(self, stats): + return self.encode({ + 'type': 'stats', + 'stats': stats + }) + class Decoder(json.JSONDecoder, BaseDecoder): def __init__(self, request_model, response_model, *a, **kw): @@ -190,6 +196,8 @@ def decode(self, message): return ('new_job_id', int(message['job_id'])) if message['type'] == 'offset': return ('offset', int(message['partition_id']), int(message['offset'])) + if message['type'] == 'stats': + return ('stats', message['stats']) raise TypeError('Unknown message type') def decode_request(self, message): diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py index fd245a518..8fe3921a5 100644 --- a/frontera/contrib/backends/remote/codecs/msgpack.py +++ b/frontera/contrib/backends/remote/codecs/msgpack.py @@ -114,7 +114,7 @@ def decode(self, buffer): return ('offset', int(obj[1]), int(obj[2])) if obj[0] == b'st': return ('stats', obj[1]) - return TypeError('Unknown message type') + raise TypeError('Unknown message type') def decode_request(self, buffer): return self._request_from_object(unpackb(buffer, encoding='utf-8')) diff --git a/tests/test_codecs.py b/tests/test_codecs.py index 6cf9ee588..321489ef2 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -123,7 +123,7 @@ def check_request(req1, req2): o = dec.decode_request(next(it)) check_request(o, req) - o_type, stats = dec.decode(next(id)) + o_type, stats = dec.decode(next(it)) assert o_type == 'stats' assert stats == stats From 5d4d98a6ac444ff80778684323155a6acf174c69 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 30 Jan 2018 10:09:26 +0100 Subject: [PATCH 088/273] other tests fix --- tests/mocks/message_bus.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/mocks/message_bus.py b/tests/mocks/message_bus.py index f8b6f582b..df869048c 100644 --- a/tests/mocks/message_bus.py +++ b/tests/mocks/message_bus.py @@ -88,6 +88,10 @@ def mark_busy(self, partition_id): self.ready_partitions.discard(partition_id) +class StatsLogStream(ScoringLogStream): + pass + + class FakeMessageBus(BaseMessageBus): def __init__(self, settings): @@ -103,3 +107,6 @@ def scoring_log(self): def spider_feed(self): return SpiderFeedStream(self) + + def stats_log(self): + return StatsLogStream(self) \ No newline at end of file From e5b946cc23af5244b5ea29335d39fef3600ca8e5 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 30 Jan 2018 10:57:33 +0100 Subject: [PATCH 089/273] moving stats to worker submodule --- frontera/worker/db.py | 2 +- frontera/{contrib/messagebus => worker}/stats.py | 3 +-- frontera/worker/strategy.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) rename frontera/{contrib/messagebus => worker}/stats.py (99%) diff --git a/frontera/worker/db.py b/frontera/worker/db.py index fc87fb8cb..09e846029 100644 --- a/frontera/worker/db.py +++ b/frontera/worker/db.py @@ -22,7 +22,7 @@ from frontera.utils.async import CallLaterOnce from frontera.utils.ossignal import install_shutdown_handlers -from frontera.contrib.messagebus.stats import StatsExportMixin +from frontera.worker.stats import StatsExportMixin from .server import WorkerJsonRpcService import six from six.moves import map diff --git a/frontera/contrib/messagebus/stats.py b/frontera/worker/stats.py similarity index 
99% rename from frontera/contrib/messagebus/stats.py rename to frontera/worker/stats.py index 0e086a034..b2f20d1ea 100644 --- a/frontera/contrib/messagebus/stats.py +++ b/frontera/worker/stats.py @@ -5,7 +5,6 @@ from frontera.utils.misc import load_object, utc_timestamp - logger = getLogger("messagebus.stats") @@ -98,4 +97,4 @@ def get_stats(self): return stats['_timestamp'] = utc_timestamp() stats['_tags'] = self._stats_tags - return stats + return stats \ No newline at end of file diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 105be8d05..ac627b70f 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -13,7 +13,7 @@ from frontera.core.manager import FrontierManager from frontera.logger.handlers import CONSOLE -from frontera.contrib.messagebus.stats import StatsExportMixin +from frontera.worker.stats import StatsExportMixin from twisted.internet.task import LoopingCall from twisted.internet import reactor From 852bef5db02dc45bac984249846f9e32140bf047 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 30 Jan 2018 11:08:41 +0100 Subject: [PATCH 090/273] base stats log, zmq stats log --- frontera/contrib/messagebus/kafkabus.py | 4 ++-- frontera/contrib/messagebus/zeromq/__init__.py | 17 ++++++++++++++--- .../contrib/messagebus/zeromq/socket_config.py | 6 ++++++ frontera/core/messagebus.py | 15 ++++++++++++++- tests/mocks/message_bus.py | 13 ++++++++++--- 5 files changed, 46 insertions(+), 9 deletions(-) diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py index d71b6db0f..cd53ed76f 100644 --- a/frontera/contrib/messagebus/kafkabus.py +++ b/frontera/contrib/messagebus/kafkabus.py @@ -10,7 +10,7 @@ from frontera.contrib.backends.partitioners import FingerprintPartitioner, Crc32NamePartitioner from frontera.contrib.messagebus.kafka.async import OffsetsFetcherAsync from frontera.core.messagebus import BaseMessageBus, BaseSpiderLogStream, BaseSpiderFeedStream, \ - BaseStreamConsumer, BaseScoringLogStream, BaseStreamProducer + BaseStreamConsumer, BaseScoringLogStream, BaseStreamProducer, BaseStatsLogStream from twisted.internet.task import LoopingCall from traceback import format_tb from os.path import join as os_path_join @@ -236,7 +236,7 @@ def producer(self): return SimpleProducer(self._location, self._enable_ssl, self._cert_path, self._topic, self._codec) -class StatsLogStream(ScoringLogStream): +class StatsLogStream(BaseStatsLogStream, ScoringLogStream): """Stats log stream implementation for Kafka message bus. 
 The interface is the same as for scoring log stream, so it's better
diff --git a/frontera/contrib/messagebus/zeromq/__init__.py b/frontera/contrib/messagebus/zeromq/__init__.py
index ca6b5e951..f54d64989 100644
--- a/frontera/contrib/messagebus/zeromq/__init__.py
+++ b/frontera/contrib/messagebus/zeromq/__init__.py
@@ -8,7 +8,7 @@ import six
 
 from frontera.core.messagebus import BaseMessageBus, BaseSpiderLogStream, BaseStreamConsumer, \
-    BaseSpiderFeedStream, BaseScoringLogStream
+    BaseSpiderFeedStream, BaseScoringLogStream, BaseStatsLogStream
 from frontera.contrib.backends.partitioners import FingerprintPartitioner, Crc32NamePartitioner
 from frontera.contrib.messagebus.zeromq.socket_config import SocketConfig
 from six.moves import range
@@ -194,6 +194,18 @@ def mark_busy(self, partition_id):
         self.ready_partitions.discard(partition_id)
 
 
+class StatsLogStream(BaseStatsLogStream):
+    def __init__(self, messagebus):
+        self.context = messagebus.context
+        self.in_location = messagebus.socket_config.stats_out()
+
+    def consumer(self):
+        pass
+
+    def producer(self):
+        return Producer(self.context, self.in_location, b'st')
+
+
 class Context(object):
 
     zeromq = zmq.Context()
@@ -223,5 +235,4 @@ def spider_feed(self):
         return SpiderFeedStream(self)
 
     def stats_log(self):
-        # FIXME implement it for completeness
-        self.logger.warning("Exporting stats is not implemented for ZeroMQ yet")
+        return StatsLogStream(self)
diff --git a/frontera/contrib/messagebus/zeromq/socket_config.py b/frontera/contrib/messagebus/zeromq/socket_config.py
index 6ecddf842..097034c9f 100644
--- a/frontera/contrib/messagebus/zeromq/socket_config.py
+++ b/frontera/contrib/messagebus/zeromq/socket_config.py
@@ -61,3 +61,9 @@ def db_out(self):
         TCP socket for outgoing DW messages
         """
         return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 5)
+
+    def stats_out(self):
+        """
+        TCP socket for outgoing stats
+        """
+        return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 6)
diff --git a/frontera/core/messagebus.py b/frontera/core/messagebus.py
index 6b8a6db95..795495728 100644
--- a/frontera/core/messagebus.py
+++ b/frontera/core/messagebus.py
@@ -125,11 +125,24 @@ def producer(self):
 
 
 @six.add_metaclass(ABCMeta)
-class BaseStatsLogStream(BaseScoringLogStream):
+class BaseStatsLogStream(object):
     """
     Stats log stream base class. This stream transfers stats metrics
     from workers and spiders to external data sources. This type of stream
     doesn't require any partitioning.
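+
+    A message bus that supports stats export returns an instance of a
+    subclass from its stats_log() method; a rough usage sketch
+    (variable names are illustrative):
+
+        stream = message_bus.stats_log()
+        producer = stream.producer()
+        producer.send(key, encoded_stats_message)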
""" + @abstractmethod + def consumer(self): + """ + :return: BaseStreamConsumer instance + """ + raise NotImplementedError + + @abstractmethod + def producer(self): + """ + :return: BaseStreamProducer instance + """ + raise NotImplementedError @six.add_metaclass(ABCMeta) diff --git a/tests/mocks/message_bus.py b/tests/mocks/message_bus.py index df869048c..dd4be971c 100644 --- a/tests/mocks/message_bus.py +++ b/tests/mocks/message_bus.py @@ -1,5 +1,5 @@ from frontera.core.messagebus import BaseMessageBus, BaseSpiderLogStream, BaseStreamConsumer, \ - BaseScoringLogStream, BaseSpiderFeedStream + BaseScoringLogStream, BaseSpiderFeedStream, BaseStatsLogStream class Consumer(BaseStreamConsumer): @@ -88,8 +88,15 @@ def mark_busy(self, partition_id): self.ready_partitions.discard(partition_id) -class StatsLogStream(ScoringLogStream): - pass +class StatsLogStream(BaseStatsLogStream): + def __init__(self, messagebus): + pass + + def producer(self): + return Producer() + + def consumer(self): + return Consumer() class FakeMessageBus(BaseMessageBus): From 500055fadb19adead7b36b516041164660daf52c Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 14 Jul 2017 14:27:30 +0300 Subject: [PATCH 091/273] Split DB worker to components based on threads Mv domains blacklist to settings Rely on NEW_BATCH_DELAY for batchgen backoff # Conflicts: # frontera/worker/db.py --- frontera/settings/default_settings.py | 1 + frontera/worker/components/__init__.py | 55 +++ frontera/worker/components/batch_generator.py | 98 +++++ .../worker/components/incoming_consumer.py | 121 ++++++ .../worker/components/scoring_consumer.py | 54 +++ frontera/worker/db.py | 359 +++++------------- frontera/worker/server.py | 12 +- 7 files changed, 438 insertions(+), 262 deletions(-) create mode 100644 frontera/worker/components/__init__.py create mode 100644 frontera/worker/components/batch_generator.py create mode 100644 frontera/worker/components/incoming_consumer.py create mode 100644 frontera/worker/components/scoring_consumer.py diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 5d0ffe005..b7d65ac1c 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -32,6 +32,7 @@ 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware', ] NEW_BATCH_DELAY = 30.0 +DOMAINS_BLACKLIST = None OVERUSED_SLOT_FACTOR = 5.0 OVERUSED_MAX_PER_KEY = None OVERUSED_MAX_KEYS = None diff --git a/frontera/worker/components/__init__.py b/frontera/worker/components/__init__.py new file mode 100644 index 000000000..24ddbffa2 --- /dev/null +++ b/frontera/worker/components/__init__.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +import time +import logging +import threading + +from frontera.exceptions import NotConfigured + +from twisted.internet import reactor, task, threads + + +class DBWorkerComponent(object): + """Base class for DB worker component. + + The class defines a single interface for DB worker components: you should + mainly implement only .run() method representing a single component iteration. 
+ """ + + NAME = None + + def __init__(self, worker, settings, stop_event, *args, **kwargs): + self.worker = worker + self.settings = settings + self.stop_event = stop_event + self.logger = logging.getLogger('db-worker.{}'.format(self.NAME)) + self.run_backoff = 0 # replace it with a proper value in subclass + + def schedule(self): + return threads.deferToThread(self.loop) + + def loop(self): + """Main entrypoint for the thread running loop.""" + while not self.stop_event.is_set(): + try: + self.run() + except Exception as exc: + self.logger.exception('Exception in the main loop') + if self.run_backoff: + self.logger.debug('Sleep for {} seconds before next run()' + .format(self.run_backoff)) + time.sleep(self.run_backoff) + self.logger.debug("Main loop was stopped") + self.close() + + def run(self): + """Logic for single iteration of the component.""" + raise NotImplementedError + + def update_stats(self, **kwargs): + """Helper to update worker stats.""" + threads.blockingCallFromThread(reactor, self.worker.update_stats, **kwargs) + + def close(self): + """Optional method to do some clean-up before exiting main loop.""" diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py new file mode 100644 index 000000000..9347ac878 --- /dev/null +++ b/frontera/worker/components/batch_generator.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +import threading +from time import asctime + +from six.moves import map + +from frontera.exceptions import NotConfigured +from frontera.utils.url import parse_domain_from_url_fast +from . import DBWorkerComponent + + +class BatchGenerator(DBWorkerComponent): + """Component to get data from backend and send it to spider feed log.""" + + NAME = 'batchgen' + + def __init__(self, worker, settings, stop_event, no_batches=False, **kwargs): + super(BatchGenerator, self).__init__(worker, settings, stop_event, **kwargs) + if no_batches: + raise NotConfigured('BatchGenerator is disabled with --no-batches') + + self.run_backoff = settings.get('NEW_BATCH_DELAY') + self.backend = worker.backend + self.spider_feed = worker.message_bus.spider_feed() + self.spider_feed_producer = self.spider_feed.producer() + + self.get_key_function = self.get_fingerprint + if settings.get('QUEUE_HOSTNAME_PARTITIONING'): + self.get_key_function = self.get_hostname + + self.domains_blacklist = settings.get('DOMAINS_BLACKLIST') + self.max_next_requests = settings.MAX_NEXT_REQUESTS + # create an event to disable/enable batches generation via RPC + self.disabled_event = threading.Event() + + def run(self): + if self.disabled_event.is_set(): + return + + partitions = self.spider_feed.available_partitions() + if not partitions: + return + self.logger.info("Getting new batches for partitions %s", + str(",").join(map(str, partitions))) + + count = 0 + for request in self.backend.get_next_requests(self.max_next_requests, + partitions=partitions): + if self._is_domain_blacklisted(request): + continue + try: + request.meta[b'jid'] = self.worker.job_id + eo = self.worker._encoder.encode_request(request) + except Exception as e: + self.logger.error("Encoding error, %s, fingerprint: %s, url: %s" % + (e, self.get_fingerprint(request), request.url)) + continue + else: + self.spider_feed_producer.send(self.get_key_function(request), eo) + finally: + count += 1 + + self.update_stats(increments={'pushed_since_start': count, 'batches_after_start': 1}, + replacements={'last_batch_size': count, + 'last_batch_generated': 
asctime()}) + + def _is_domain_blacklisted(self, request): + if not self.domains_blacklist: + return + if 'domain' in request.meta: + hostname = request.meta['domain'].get('name') + else: + _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) + if hostname: + hostname = hostname.lower() + if hostname in self.domains_blacklist: + self.logger.debug("Dropping black-listed hostname, URL %s", request.url) + return True + return False + + def close(self): + self.spider_feed_producer.close() + + # --------------------------- Auxiliary tools -------------------------------- + + def get_fingerprint(self, request): + return request.meta[b'fingerprint'] + + def get_hostname(self, request): + try: + _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) + except Exception as e: + self.logger.error("URL parsing error %s, fingerprint %s, url %s" % + (e, request.meta[b'fingerprint'], request.url)) + else: + return hostname.encode('utf-8', 'ignore') diff --git a/frontera/worker/components/incoming_consumer.py b/frontera/worker/components/incoming_consumer.py new file mode 100644 index 000000000..be7d4abec --- /dev/null +++ b/frontera/worker/components/incoming_consumer.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +from time import asctime +from collections import defaultdict + +from frontera.exceptions import NotConfigured +from . import DBWorkerComponent + + +class IncomingConsumer(DBWorkerComponent): + """Component to get data from spider log and handle it with backend.""" + + NAME = 'incoming' + + def __init__(self, worker, settings, stop_event, no_incoming=False, **kwargs): + super(IncomingConsumer, self).__init__(worker, settings, stop_event, **kwargs) + if no_incoming: + raise NotConfigured('IncomingConsumer is disabled with --no-incoming') + + spider_log = worker.message_bus.spider_log() + self.spider_log_consumer = spider_log.consumer(partition_id=None, type=b'db') + self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') + + # spider-feed is required only to handle 'offset' messages: + # check lag to decide if mark feed producer as busy or ready + # XXX not implemented for kafka message bus + self.spider_feed = worker.message_bus.spider_feed() + self.spider_feed_producer = self.spider_feed.producer() + + self.backend = worker.backend + self.max_next_requests = settings.MAX_NEXT_REQUESTS + + def run(self): + consumed, stats = 0, defaultdict(int) + for m in self.spider_log_consumer.get_messages( + timeout=1.0, count=self.spider_log_consumer_batch_size): + try: + msg = self.worker._decoder.decode(m) + except (KeyError, TypeError) as e: + self.logger.error("Decoding error: %s", e) + else: + self._handle_message(msg, stats) + finally: + consumed += 1 + """ + # TODO: Think how it should be implemented in DB-worker only mode. 
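+        # (left disabled for now: calling reactor.stop() from this consumer
+        #  would also shut down every other running worker component)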
+        if not self.strategy_disabled and self._backend.finished():
+            logger.info("Crawling is finished.")
+            reactor.stop()
+        """
+        stats_increments = {'consumed_since_start': consumed}
+        stats_increments.update(stats)
+        self.update_stats(increments=stats_increments,
+                          replacements={'last_consumed': consumed,
+                                        'last_consumption_run': asctime()})
+
+    def _handle_message(self, msg, stats):
+        """Base logic to safely handle a message."""
+        try:
+            self._handle_message_by_type(msg[0], msg, stats)
+        except Exception as exc:
+            self.logger.exception("Error while handling a message")
+            self.logger.debug("Message caused the error %s", str(msg))
+
+    def _handle_message_by_type(self, msg_type, msg, stats):
+        if msg_type == 'add_seeds':
+            _, seeds = msg
+            self.logger.info('Adding %i seeds', len(seeds))
+            for seed in seeds:
+                self.logger.debug('URL: %s', seed.url)
+            self.backend.add_seeds(seeds)
+            stats['consumed_add_seeds'] += 1
+
+        elif msg_type == 'page_crawled':
+            _, response = msg
+            self.logger.debug("Page crawled %s", response.url)
+            if b'jid' not in response.meta or response.meta[b'jid'] != self.worker.job_id:
+                return
+            self.backend.page_crawled(response)
+            stats['consumed_page_crawled'] += 1
+
+        elif msg_type == 'links_extracted':
+            _, request, links = msg
+            self.logger.debug("Links extracted %s (%d)", request.url, len(links))
+            if b'jid' not in request.meta or request.meta[b'jid'] != self.worker.job_id:
+                return
+            self.backend.links_extracted(request, links)
+            stats['consumed_links_extracted'] += 1
+
+        elif msg_type == 'request_error':
+            _, request, error = msg
+            self.logger.debug("Request error %s", request.url)
+            if b'jid' not in request.meta or request.meta[b'jid'] != self.worker.job_id:
+                return
+            self.backend.request_error(request, error)
+            stats['consumed_request_error'] += 1
+
+        elif msg_type == 'offset':
+            _, partition_id, offset = msg
+            producer_offset = self.spider_feed_producer.get_offset(partition_id)
+            if producer_offset is None:
+                return
+            else:
+                lag = producer_offset - offset
+                if lag < 0:
+                    # nonsense in general, happens when SW is restarted and
+                    # not synced yet with Spiders.
+                    return
+                if lag < self.max_next_requests or offset == 0:
+                    self.spider_feed.mark_ready(partition_id)
+                else:
+                    self.spider_feed.mark_busy(partition_id)
+            stats['consumed_offset'] += 1
+
+        else:
+            self.logger.debug('Unknown message type %s', msg[0])
+
+    def close(self):
+        self.spider_feed_producer.close()
+        self.spider_log_consumer.close()
diff --git a/frontera/worker/components/scoring_consumer.py b/frontera/worker/components/scoring_consumer.py
new file mode 100644
index 000000000..a86bbf4bd
--- /dev/null
+++ b/frontera/worker/components/scoring_consumer.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+from time import asctime
+
+from frontera.exceptions import NotConfigured
+from frontera.core.components import DistributedBackend
+from . 
import DBWorkerComponent + + +class ScoringConsumer(DBWorkerComponent): + """Component to get data from scoring log and send it to backend queue.""" + + NAME = 'scoring' + + def __init__(self, worker, settings, stop_event, no_scoring=False, **kwargs): + super(ScoringConsumer, self).__init__(worker, settings, stop_event, **kwargs) + if no_scoring: + raise NotConfigured('ScoringConsumer is disabled with --no-scoring') + if not isinstance(worker.backend, DistributedBackend): + raise NotConfigured('Strategy is disabled for non-distributed backend') + + scoring_log = worker.message_bus.scoring_log() + self.scoring_log_consumer = scoring_log.consumer() + self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE') + self.backend_queue = worker.backend.queue + + def run(self): + consumed, seen, batch = 0, set(), [] + for m in self.scoring_log_consumer.get_messages( + count=self.scoring_log_consumer_batch_size): + try: + msg = self.worker._decoder.decode(m) + except (KeyError, TypeError) as e: + self.logger.error("Decoding error: %s", e) + continue + else: + if msg[0] == 'update_score': + _, request, score, schedule = msg + if request.meta[b'fingerprint'] not in seen: + batch.append((request.meta[b'fingerprint'], + score, request, schedule)) + seen.add(request.meta[b'fingerprint']) + elif msg[0] == 'new_job_id': + self.worker.job_id = msg[1] + finally: + consumed += 1 + self.backend_queue.schedule(batch) + self.update_stats(increments={'consumed_scoring_since_start': consumed}, + replacements={'last_consumed_scoring': consumed, + 'last_consumption_run_scoring': asctime()}) + + def close(self): + self.scoring_log_consumer.close() diff --git a/frontera/worker/db.py b/frontera/worker/db.py index 09e846029..da5324ebc 100644 --- a/frontera/worker/db.py +++ b/frontera/worker/db.py @@ -2,137 +2,146 @@ from __future__ import absolute_import import os +import time import logging +import threading from traceback import format_stack from signal import signal, SIGUSR1 -from logging.config import fileConfig +from collections import defaultdict from argparse import ArgumentParser -from time import asctime -from os.path import exists +from logging.config import fileConfig -from twisted.internet import reactor, task +import six +from twisted.internet import reactor, task, defer from twisted.internet.defer import Deferred -from frontera.core.components import DistributedBackend -from frontera.core.manager import FrontierManager -from frontera.utils.url import parse_domain_from_url_fast -from frontera.logger.handlers import CONSOLE from frontera.settings import Settings from frontera.utils.misc import load_object -from frontera.utils.async import CallLaterOnce +from frontera.logger.handlers import CONSOLE +from frontera.exceptions import NotConfigured +from frontera.core.manager import FrontierManager +from frontera.worker.server import WorkerJsonRpcService from frontera.utils.ossignal import install_shutdown_handlers - from frontera.worker.stats import StatsExportMixin -from .server import WorkerJsonRpcService -import six -from six.moves import map -logger = logging.getLogger("db-worker") +from .components.incoming_consumer import IncomingConsumer +from .components.scoring_consumer import ScoringConsumer +from .components.batch_generator import BatchGenerator -class Slot(object): - def __init__(self, new_batch, consume_incoming, consume_scoring, no_batches, no_scoring_log, - new_batch_delay, no_spider_log): - self.new_batch = CallLaterOnce(new_batch) - 
self.new_batch.setErrback(self.error)
 
+ALL_COMPONENTS = [ScoringConsumer, IncomingConsumer, BatchGenerator]
+LOGGING_TASK_INTERVAL = 30
+
 logger = logging.getLogger("db-worker")
 
-        self.consumption = CallLaterOnce(consume_incoming)
-        self.consumption.setErrback(self.error)
 
-        self.scheduling = CallLaterOnce(self.schedule)
-        self.scheduling.setErrback(self.error)
+class Slot(object):
+    """Slot component to manage worker components.
+
+    Slot is responsible for scheduling all the components, modifying their
+    behaviour and stopping them gracefully at the worker's discretion.
+    """
+    def __init__(self, worker, settings, **kwargs):
+        # single event to stop all the components at once
+        self.stop_event = threading.Event()
+        self.components = self._load_components(worker, settings, **kwargs)
+        self._setup_managing_batches()
+        self._deferred = None
+
+    def _load_components(self, worker, settings, **kwargs):
+        # each component is stored as (cls, instance) pair
+        components = {}
+        for cls in ALL_COMPONENTS:
+            try:
+                component = cls(worker, settings, self.stop_event, **kwargs)
+            except NotConfigured:
+                logger.info("Component {} is disabled".format(cls.NAME))
+            else:
+                components[cls] = component
+        if not components:
+            raise NotConfigured("No components to run, please check your input args")
+        return components
 
-        self.scoring_consumption = CallLaterOnce(consume_scoring)
-        self.scoring_consumption.setErrback(self.error)
+    def schedule(self):
+        components = [component.schedule() for component in self.components.values()]
+        self._deferred = defer.DeferredList(components)
 
-        self.no_batches = no_batches
-        self.no_scoring_log = no_scoring_log
-        self.no_spider_log = no_spider_log
-        self.new_batch_delay = new_batch_delay
+    def stop(self):
+        if self._deferred:
+            # set stop flag and return a deferred connected with all running threads
+            self.stop_event.set()
+            return self._deferred
 
-    def error(self, f):
-        logger.exception(f.value)
-        return f
+    # Additional functions to manage specific components
 
-    def schedule(self, on_start=False):
-        if on_start and not self.no_batches:
-            self.new_batch.schedule(0)
+    # XXX do we actually use this feature to disable/enable new batches?
+ # it should be easier to just stop the batchgen component and start it again when needed - if not self.no_spider_log: - self.consumption.schedule() - if not self.no_batches: - self.new_batch.schedule(self.new_batch_delay) - if not self.no_scoring_log: - self.scoring_consumption.schedule() - self.scheduling.schedule(5.0) + def _setup_managing_batches(self): + """Save batch-gen specific event to disable/enable it via RPC calls.""" + batchgen = self.components.get(BatchGenerator) + self.batches_disabled_event = batchgen.disabled_event if batchgen else None - def cancel(self): - self.scheduling.cancel() - self.scoring_consumption.cancel() - self.new_batch.cancel() - self.consumption.cancel() + def manage_new_batches(self, enable): + if self.batches_disabled_event: + self.batches_disabled_event.clear() if enable else self.batches_disabled_event.set() class BaseDBWorker(object): """Base database worker class.""" def __init__(self, settings, no_batches, no_incoming, no_scoring): - messagebus = load_object(settings.get('MESSAGE_BUS')) - self.mb = messagebus(settings) - spider_log = self.mb.spider_log() - self.spider_feed = self.mb.spider_feed() - self.spider_log_consumer = spider_log.consumer(partition_id=None, type=b'db') - self.spider_feed_producer = self.spider_feed.producer() + messagebus = load_object(settings.get('MESSAGE_BUS')) + self.message_bus = messagebus(settings) self._manager = FrontierManager.from_settings(settings, db_worker=True) - self._backend = self._manager.backend + self.backend = self._manager.backend + codec_path = settings.get('MESSAGE_BUS_CODEC') encoder_cls = load_object(codec_path+".Encoder") decoder_cls = load_object(codec_path+".Decoder") self._encoder = encoder_cls(self._manager.request_model) self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model) - if isinstance(self._backend, DistributedBackend) and not no_scoring: - scoring_log = self.mb.scoring_log() - self.scoring_log_consumer = scoring_log.consumer() - self.queue = self._backend.queue - self.strategy_disabled = False - else: - self.strategy_disabled = True - self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') - self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE') - self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname' - self.max_next_requests = settings.MAX_NEXT_REQUESTS - self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, - self.strategy_disabled, settings.get('NEW_BATCH_DELAY'), no_incoming) - self.job_id = 0 - self.stats = { - 'consumed_since_start': 0, - 'consumed_scoring_since_start': 0, - 'pushed_since_start': 0, - 'consumed_add_seeds': 0, - 'consumed_page_crawled': 0, - 'consumed_links_extracted': 0, - 'consumed_request_error': 0, - 'consumed_offset': 0 - } - self._logging_task = task.LoopingCall(self.log_status) + slot_kwargs = {'no_batches': no_batches, + 'no_incoming': no_incoming, + 'no_scoring': no_scoring} + self.slot = Slot(self, settings, **slot_kwargs) - def set_process_info(self, process_info): - self.process_info = process_info + self.stats = defaultdict(int) + self._logging_task = task.LoopingCall(self.log_status) def run(self): def debug(sig, frame): logger.critical("Signal received: printing stack trace") logger.critical(str("").join(format_stack(frame))) - self.slot.schedule(on_start=True) - self._logging_task.start(30) + self.slot.schedule() + self._logging_task.start(LOGGING_TASK_INTERVAL) 
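+        # graceful-shutdown and SIGUSR1 stack-trace handlers are registered
+        # just before the reactor takes over the main thread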
install_shutdown_handlers(self._handle_shutdown) signal(SIGUSR1, debug) reactor.run(installSignalHandlers=False) + # Auxiliary methods + + def update_stats(self, replacements=None, increments=None): + if replacements: + for key, value in replacements.items(): + self.stats[key] = value + if increments: + for key, value in increments.items(): + self.stats[key] += value + + def set_process_info(self, process_info): + self.process_info = process_info + + def log_status(self): + for k, v in six.iteritems(self.stats): + logger.info("%s=%s", k, v) + + # Graceful shutdown + def _handle_shutdown(self, signum, _): def call_shutdown(): d = self.stop_tasks() @@ -144,191 +153,27 @@ def call_shutdown(): def stop_tasks(self): logger.info("Stopping periodic tasks.") self._logging_task.stop() - self.slot.cancel() d = Deferred() + d.addBoth(self._stop_slot) d.addBoth(self._perform_shutdown) d.addBoth(self._stop_reactor) return d - def _stop_reactor(self, _=None): - logger.info("Stopping reactor.") - try: - reactor.stop() - except RuntimeError: # raised if already stopped or in shutdown stage - pass + def _stop_slot(self, _=None): + logger.info("Stopping DB worker slot.") + return self.slot.stop() def _perform_shutdown(self, _=None): logger.info("Stopping frontier manager.") self._manager.stop() - logger.info("Closing message bus.") - if not self.strategy_disabled: - self.scoring_log_consumer.close() - self.spider_feed_producer.close() - self.spider_log_consumer.close() - def log_status(self): - for k, v in six.iteritems(self.stats): - logger.info("%s=%s", k, v) - - def disable_new_batches(self): - self.slot.no_batches = True - - def enable_new_batches(self): - self.slot.no_batches = False - - def consume_incoming(self, *args, **kwargs): - consumed = 0 - for m in self.spider_log_consumer.get_messages(timeout=1.0, count=self.spider_log_consumer_batch_size): - try: - msg = self._decoder.decode(m) - except (KeyError, TypeError) as e: - logger.error("Decoding error: %s", e) - continue - else: - try: - type = msg[0] - if type == 'add_seeds': - _, seeds = msg - logger.info('Adding %i seeds', len(seeds)) - for seed in seeds: - logger.debug('URL: %s', seed.url) - self._backend.add_seeds(seeds) - self.stats['consumed_add_seeds'] += 1 - continue - if type == 'page_crawled': - _, response = msg - logger.debug("Page crawled %s", response.url) - if b'jid' not in response.meta or response.meta[b'jid'] != self.job_id: - continue - self._backend.page_crawled(response) - self.stats['consumed_page_crawled'] += 1 - continue - if type == 'links_extracted': - _, request, links = msg - logger.debug("Links extracted %s (%d)", request.url, len(links)) - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: - continue - self._backend.links_extracted(request, links) - self.stats['consumed_links_extracted'] += 1 - continue - if type == 'request_error': - _, request, error = msg - logger.debug("Request error %s", request.url) - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: - continue - self._backend.request_error(request, error) - self.stats['consumed_request_error'] += 1 - continue - if type == 'offset': - _, partition_id, offset = msg - producer_offset = self.spider_feed_producer.get_offset(partition_id) - if producer_offset is None: - continue - else: - lag = producer_offset - offset - if lag < 0: - # non-sense in general, happens when SW is restarted and not synced yet with Spiders. 
- continue - if lag < self.max_next_requests or offset == 0: - self.spider_feed.mark_ready(partition_id) - else: - self.spider_feed.mark_busy(partition_id) - self.stats['consumed_offset'] += 1 - continue - logger.debug('Unknown message type %s', type) - except Exception as exc: - logger.exception(exc) - if logger.isEnabledFor(logging.DEBUG): - logger.debug("Message caused the error %s", str(msg)) - continue - finally: - consumed += 1 - """ - # TODO: Think how it should be implemented in DB-worker only mode. - if not self.strategy_disabled and self._backend.finished(): - logger.info("Crawling is finished.") + def _stop_reactor(self, _=None): + logger.info("Stopping reactor.") + try: reactor.stop() - """ - self.stats['consumed_since_start'] += consumed - self.stats['last_consumed'] = consumed - self.stats['last_consumption_run'] = asctime() - self.slot.schedule() - return consumed - - def consume_scoring(self, *args, **kwargs): - consumed = 0 - seen = set() - batch = [] - for m in self.scoring_log_consumer.get_messages(count=self.scoring_log_consumer_batch_size): - try: - msg = self._decoder.decode(m) - except (KeyError, TypeError) as e: - logger.error("Decoding error: %s", e) - continue - else: - if msg[0] == 'update_score': - _, request, score, schedule = msg - if request.meta[b'fingerprint'] not in seen: - batch.append((request.meta[b'fingerprint'], score, request, schedule)) - seen.add(request.meta[b'fingerprint']) - if msg[0] == 'new_job_id': - self.job_id = msg[1] - finally: - consumed += 1 - self.queue.schedule(batch) - - self.stats['consumed_scoring_since_start'] += consumed - self.stats['last_consumed_scoring'] = consumed - self.stats['last_consumption_run_scoring'] = asctime() - self.slot.schedule() - - def new_batch(self, *args, **kwargs): - def get_hostname(request): - try: - netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(request.url) - except Exception as e: - logger.error("URL parsing error %s, fingerprint %s, url %s" % (e, request.meta[b'fingerprint'], - request.url)) - return None - else: - return name.encode('utf-8', 'ignore') - - def get_fingerprint(request): - return request.meta[b'fingerprint'] - - partitions = self.spider_feed.available_partitions() - logger.info("Getting new batches for partitions %s" % str(",").join(map(str, partitions))) - if not partitions: - return 0 - - count = 0 - if self.spider_feed_partitioning == 'hostname': - get_key = get_hostname - elif self.spider_feed_partitioning == 'fingerprint': - get_key = get_fingerprint - else: - raise Exception("Unexpected value in self.spider_feed_partitioning") - - for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions): - try: - request.meta[b'jid'] = self.job_id - eo = self._encoder.encode_request(request) - except Exception as e: - logger.error("Encoding error, %s, fingerprint: %s, url: %s" % (e, - request.meta[b'fingerprint'], - request.url)) - continue - finally: - count += 1 - self.spider_feed_producer.send(get_key(request), eo) - - self.stats['pushed_since_start'] += count - self.stats['last_batch_size'] = count - self.stats.setdefault('batches_after_start', 0) - self.stats['batches_after_start'] += 1 - self.stats['last_batch_generated'] = asctime() - return count + except RuntimeError: # raised if already stopped or in shutdown stage + pass class DBWorker(StatsExportMixin, BaseDBWorker): @@ -377,7 +222,7 @@ def get_stats_tags(self, settings, no_batches, no_incoming, no_scoring): settings.set("JSONRPC_PORT", [args.port]) logging_config_path = 
settings.get("LOGGING_CONFIG")
-    if logging_config_path and exists(logging_config_path):
+    if logging_config_path and os.path.exists(logging_config_path):
         fileConfig(logging_config_path, disable_existing_loggers=False)
     else:
         logging.basicConfig(level=args.log_level)
diff --git a/frontera/worker/server.py b/frontera/worker/server.py
index a77a49bae..74530c4b1 100644
--- a/frontera/worker/server.py
+++ b/frontera/worker/server.py
@@ -79,9 +79,11 @@ def __init__(self, worker):
         JsonResource.__init__(self)
 
     def render_GET(self, txrequest):
+        batches_disabled_event = self.worker.slot.batches_disabled_event
+        disable_new_batches = batches_disabled_event.is_set() if batches_disabled_event else None
         return {
-            'is_finishing': self.worker.slot.is_finishing,
-            'disable_new_batches': self.worker.slot.no_batches,
+            'is_finishing': self.worker.slot.stop_event.is_set(),
+            'disable_new_batches': disable_new_batches,
             'stats': self.worker.stats
         }
 
@@ -116,11 +118,11 @@ def __init__(self, worker):
 
     def process_request(self, method, jrequest):
         if method == 'disable_new_batches':
-            self.worker.disable_new_batches()
+            self.worker.slot.manage_new_batches(enable=False)
             return jsonrpc_result(jrequest['id'], "success")
 
         if method == 'enable_new_batches':
-            self.worker.enable_new_batches()
+            self.worker.slot.manage_new_batches(enable=True)
             return jsonrpc_result(jrequest['id'], "success")
         raise JsonRpcError(400, "Unknown method")
 
@@ -148,7 +150,7 @@ def __init__(self, root, settings):
     def start_listening(self):
         self.port = listen_tcp(self.portrange, self.host, self)
         h = self.port.getHost()
-        logger.info('Web service listening on %(host)s:%(port)d'.format(host=h.host, port=h.port))
+        logger.info('Web service listening on {host}:{port}'.format(host=h.host, port=h.port))
 
     def stop_listening(self):
         self.port.stopListening()
From ff6c7ddcf117768044c5ae7e7ce55a6fc7bc7c60 Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Mon, 24 Jul 2017 13:31:43 +0300
Subject: [PATCH 092/273] Fix attribute error related to job_id

---
 frontera/worker/db.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/frontera/worker/db.py b/frontera/worker/db.py
index da5324ebc..e3fa88739 100644
--- a/frontera/worker/db.py
+++ b/frontera/worker/db.py
@@ -110,6 +110,7 @@ def __init__(self, settings, no_batches, no_incoming, no_scoring):
         self.slot = Slot(self, settings, **slot_kwargs)
 
         self.stats = defaultdict(int)
+        self.job_id = None
         self._logging_task = task.LoopingCall(self.log_status)
 
     def run(self):
From 469c64ab873c589748bf08f9eb371bc6ad2d05fc Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Mon, 24 Jul 2017 16:25:45 +0300
Subject: [PATCH 093/273] Run only batchgen via thread

---
 frontera/worker/components/__init__.py        | 65 +++++++++++++++----
 frontera/worker/components/batch_generator.py |  6 +-
 .../worker/components/incoming_consumer.py    | 12 ++--
 .../worker/components/scoring_consumer.py     | 12 ++--
 frontera/worker/db.py                         |  9 ++-
 5 files changed, 74 insertions(+), 30 deletions(-)

diff --git a/frontera/worker/components/__init__.py b/frontera/worker/components/__init__.py
index 24ddbffa2..90ce314ee 100644
--- a/frontera/worker/components/__init__.py
+++ b/frontera/worker/components/__init__.py
@@ -5,25 +5,69 @@
 import logging
 import threading
 
+from twisted.internet import reactor, task, threads
+
 from frontera.exceptions import NotConfigured
+from frontera.utils.async import CallLaterOnce
 
-from twisted.internet import reactor, task, threads
 
+class DBWorkerBaseComponent(object):
+
+    NAME = None
+
+    def __init__(self, worker, settings):
+
self.worker = worker
+        self.settings = settings
+        self.logger = logging.getLogger('db-worker.{}'.format(self.NAME))
+
+    def schedule(self, delay=0):
+        """Schedule component start with optional delay.
+        The function must return None or Deferred.
+        """
+        raise NotImplementedError
+
+    def run(self):
+        """Iteration logic, must be implemented in a subclass."""
+        raise NotImplementedError
+
+    def stop(self):
+        """Optional stop logic called by the reactor thread."""
 
-class DBWorkerComponent(object):
-    """Base class for DB worker component.
+
+class DBWorkerPeriodicComponent(DBWorkerBaseComponent):
+
+    def __init__(self, worker, settings, *args, **kwargs):
+        super(DBWorkerPeriodicComponent, self).__init__(worker, settings)
+        self.periodic_task = CallLaterOnce(self.run_with_callback)
+        self.periodic_task.setErrback(self.run_errback)
+
+    def schedule(self, delay=0):
+        self.logger.info('Periodic scheduled!')
+        self.periodic_task.schedule(delay)
+
+    def run_with_callback(self):
+        self.logger.info('Run with callback!')
+        self.run()
+        self.periodic_task.schedule()
+        self.logger.info('Scheduled again!')
+
+    def run_errback(self, failure):
+        self.logger.exception(failure.value)
+        self.periodic_task.schedule()
+
+    def stop(self):
+        self.periodic_task.cancel()
+
+
+class DBWorkerThreadComponent(DBWorkerBaseComponent):
+    """Base class for DB worker component running in a separate thread.
 
     The class defines a single interface for DB worker components: you should
     typically implement only the .run() method, representing a single component iteration.
     """
 
-    NAME = None
-
     def __init__(self, worker, settings, stop_event, *args, **kwargs):
-        self.worker = worker
-        self.settings = settings
         self.stop_event = stop_event
-        self.logger = logging.getLogger('db-worker.{}'.format(self.NAME))
         self.run_backoff = 0  # replace it with a proper value in subclass
 
     def schedule(self):
@@ -32,6 +76,7 @@ def loop(self):
         """Main entrypoint for the thread running loop."""
         while not self.stop_event.is_set():
+            self.logger.info('Thread iteration!')
             try:
                 self.run()
             except Exception as exc:
@@ -41,7 +86,6 @@
                                 .format(self.run_backoff))
                 time.sleep(self.run_backoff)
         self.logger.debug("Main loop was stopped")
-        self.close()
 
     def run(self):
         """Logic for single iteration of the component."""
@@ -50,6 +94,3 @@
     def update_stats(self, **kwargs):
         """Helper to update worker stats."""
         threads.blockingCallFromThread(reactor, self.worker.update_stats, **kwargs)
-
-    def close(self):
-        """Optional method to do some clean-up before exiting main loop."""
diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py
index 9347ac878..20b6fab36 100644
--- a/frontera/worker/components/batch_generator.py
+++ b/frontera/worker/components/batch_generator.py
@@ -8,10 +8,10 @@
 
 from frontera.exceptions import NotConfigured
 from frontera.utils.url import parse_domain_from_url_fast
-from . import DBWorkerComponent
+from . 
import DBWorkerThreadComponent -class BatchGenerator(DBWorkerComponent): +class BatchGenerator(DBWorkerThreadComponent): """Component to get data from backend and send it to spider feed log.""" NAME = 'batchgen' @@ -80,7 +80,7 @@ def _is_domain_blacklisted(self, request): return True return False - def close(self): + def stop(self): self.spider_feed_producer.close() # --------------------------- Auxiliary tools -------------------------------- diff --git a/frontera/worker/components/incoming_consumer.py b/frontera/worker/components/incoming_consumer.py index be7d4abec..174140978 100644 --- a/frontera/worker/components/incoming_consumer.py +++ b/frontera/worker/components/incoming_consumer.py @@ -5,10 +5,10 @@ from collections import defaultdict from frontera.exceptions import NotConfigured -from . import DBWorkerComponent +from . import DBWorkerPeriodicComponent -class IncomingConsumer(DBWorkerComponent): +class IncomingConsumer(DBWorkerPeriodicComponent): """Component to get data from spider log and handle it with backend.""" NAME = 'incoming' @@ -51,9 +51,9 @@ def run(self): """ stats_increments = {'consumed_since_start': consumed} stats_increments.update(stats) - self.update_stats(increments=stats_increments, - replacements={'last_consumed': consumed, - 'last_consumption_run': asctime()}) + self.worker.update_stats(increments=stats_increments, + replacements={'last_consumed': consumed, + 'last_consumption_run': asctime()}) def _handle_message(self, msg, stats): """Base logic to safely handle a message.""" @@ -116,6 +116,6 @@ def _handle_message_by_type(self, msg_type, msg, stats): else: self.logger.debug('Unknown message type %s', msg[0]) - def close(self): + def stop(self): self.spider_feed_producer.close() self.spider_log_consumer.close() diff --git a/frontera/worker/components/scoring_consumer.py b/frontera/worker/components/scoring_consumer.py index a86bbf4bd..5d7d702f1 100644 --- a/frontera/worker/components/scoring_consumer.py +++ b/frontera/worker/components/scoring_consumer.py @@ -5,10 +5,10 @@ from frontera.exceptions import NotConfigured from frontera.core.components import DistributedBackend -from . import DBWorkerComponent +from . 
import DBWorkerPeriodicComponent
 
 
-class ScoringConsumer(DBWorkerComponent):
+class ScoringConsumer(DBWorkerPeriodicComponent):
     """Component to get data from scoring log and send it to backend queue."""
 
     NAME = 'scoring'
@@ -46,9 +46,9 @@ def run(self):
             finally:
                 consumed += 1
         self.backend_queue.schedule(batch)
-        self.update_stats(increments={'consumed_scoring_since_start': consumed},
-                          replacements={'last_consumed_scoring': consumed,
-                                        'last_consumption_run_scoring': asctime()})
+        self.worker.update_stats(increments={'consumed_scoring_since_start': consumed},
+                                 replacements={'last_consumed_scoring': consumed,
+                                               'last_consumption_run_scoring': asctime()})
 
-    def close(self):
+    def stop(self):
         self.scoring_log_consumer.close()
diff --git a/frontera/worker/db.py b/frontera/worker/db.py
index e3fa88739..ef7507382 100644
--- a/frontera/worker/db.py
+++ b/frontera/worker/db.py
@@ -53,7 +53,7 @@ def _load_components(self, worker, settings, **kwargs):
         components = {}
         for cls in ALL_COMPONENTS:
             try:
-                component = cls(worker, settings, self.stop_event, **kwargs)
+                component = cls(worker, settings, stop_event=self.stop_event, **kwargs)
             except NotConfigured:
                 logger.info("Component {} is disabled".format(cls.NAME))
             else:
@@ -63,10 +63,13 @@ def _load_components(self, worker, settings, **kwargs):
         return components
 
     def schedule(self):
-        components = [component.schedule() for component in self.components.values()]
-        self._deferred = defer.DeferredList(components)
+        # component.schedule() function must return None or Deferred
+        deferred = filter(None, (component.schedule() for component in self.components.values()))
+        self._deferred = defer.DeferredList(deferred)
 
     def stop(self):
+        for component in self.components.values():
+            component.stop()
         if self._deferred:
             # set stop flag and return a deferred connected with all running threads
             self.stop_event.set()
From 5d7b1e6d673674e89746e589934c623c2633afd3 Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Mon, 24 Jul 2017 16:57:09 +0300
Subject: [PATCH 094/273] Polish changes and fix minor issues

---
 frontera/worker/components/__init__.py        | 34 +++++++++----------
 frontera/worker/components/batch_generator.py |  2 +-
 .../worker/components/incoming_consumer.py    |  2 +-
 .../worker/components/scoring_consumer.py     |  2 +-
 frontera/worker/db.py                         | 18 +++++++---
 5 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/frontera/worker/components/__init__.py b/frontera/worker/components/__init__.py
index 90ce314ee..85734b2d7 100644
--- a/frontera/worker/components/__init__.py
+++ b/frontera/worker/components/__init__.py
@@ -15,9 +15,10 @@ class DBWorkerBaseComponent(object):
 
     NAME = None
 
-    def __init__(self, worker, settings):
+    def __init__(self, worker, settings, stop_event):
         self.worker = worker
         self.settings = settings
+        self.stop_event = stop_event
         self.logger = logging.getLogger('db-worker.{}'.format(self.NAME))
 
     def schedule(self, delay=0):
@@ -30,33 +31,33 @@ def run(self):
         """Iteration logic, must be implemented in a subclass."""
         raise NotImplementedError
 
-    def stop(self):
-        """Optional stop logic called by the reactor thread."""
+    def close(self):
+        """Optional cleanup logic when component loop is stopped."""
 
 
 class DBWorkerPeriodicComponent(DBWorkerBaseComponent):
 
-    def __init__(self, worker, settings, *args, **kwargs):
-        super(DBWorkerPeriodicComponent, self).__init__(worker, settings)
-        self.periodic_task = CallLaterOnce(self.run_with_callback)
+    def __init__(self, worker, settings, stop_event, *args, **kwargs):
+        super(DBWorkerPeriodicComponent, 
self).__init__(worker, settings, stop_event)
+        self.periodic_task = CallLaterOnce(self.run_and_reschedule)
+        self.periodic_task.setErrback(self.run_errback)
 
     def schedule(self, delay=0):
-        self.logger.info('Periodic scheduled!')
         self.periodic_task.schedule(delay)
 
-    def run_with_callback(self):
-        self.logger.info('Run with callback!')
-        self.run()
-        self.periodic_task.schedule()
-        self.logger.info('Scheduled again!')
+    def run_and_reschedule(self):
+        if not self.stopped:
+            self.run()
+            self.periodic_task.schedule()
 
     def run_errback(self, failure):
         self.logger.exception(failure.value)
-        self.periodic_task.schedule()
+        if not self.stopped:
+            self.periodic_task.schedule()
 
-    def stop(self):
-        self.periodic_task.cancel()
+    @property
+    def stopped(self):
+        return self.stop_event.is_set()
 
 
 class DBWorkerThreadComponent(DBWorkerBaseComponent):
@@ -67,7 +68,7 @@ class DBWorkerThreadComponent(DBWorkerBaseComponent):
 
     def __init__(self, worker, settings, stop_event, *args, **kwargs):
-        self.stop_event = stop_event
+        super(DBWorkerThreadComponent, self).__init__(worker, settings, stop_event)
         self.run_backoff = 0  # replace it with a proper value in subclass
 
     def schedule(self):
@@ -76,7 +77,6 @@ def loop(self):
         """Main entrypoint for the thread running loop."""
         while not self.stop_event.is_set():
-            self.logger.info('Thread iteration!')
             try:
                 self.run()
             except Exception as exc:
diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py
index 20b6fab36..36fb2f3d0 100644
--- a/frontera/worker/components/batch_generator.py
+++ b/frontera/worker/components/batch_generator.py
@@ -80,7 +80,7 @@ def _is_domain_blacklisted(self, request):
             return True
         return False
 
-    def stop(self):
+    def close(self):
         self.spider_feed_producer.close()
 
     # --------------------------- Auxiliary tools --------------------------------
diff --git a/frontera/worker/components/incoming_consumer.py b/frontera/worker/components/incoming_consumer.py
index 174140978..98ad19527 100644
--- a/frontera/worker/components/incoming_consumer.py
+++ b/frontera/worker/components/incoming_consumer.py
@@ -116,6 +116,6 @@ def _handle_message_by_type(self, msg_type, msg, stats):
         else:
             self.logger.debug('Unknown message type %s', msg[0])
 
-    def stop(self):
+    def close(self):
         self.spider_feed_producer.close()
         self.spider_log_consumer.close()
diff --git a/frontera/worker/components/scoring_consumer.py b/frontera/worker/components/scoring_consumer.py
index 5d7d702f1..4ccb6b0a9 100644
--- a/frontera/worker/components/scoring_consumer.py
+++ b/frontera/worker/components/scoring_consumer.py
@@ -50,5 +50,5 @@ def run(self):
                                                'last_consumption_run_scoring': asctime()})
 
-    def stop(self):
+    def close(self):
         self.scoring_log_consumer.close()
diff --git a/frontera/worker/db.py b/frontera/worker/db.py
index ef7507382..296e63c3a 100644
--- a/frontera/worker/db.py
+++ b/frontera/worker/db.py
@@ -64,17 +64,20 @@ def _load_components(self, worker, settings, **kwargs):
 
     def schedule(self):
         # component.schedule() function must return None or Deferred
-        deferred = filter(None, (component.schedule() for component in self.components.values()))
+        scheduled = [component.schedule() for component in self.components.values()]
+        deferred = [result for result in scheduled if isinstance(result, Deferred)]
         self._deferred = defer.DeferredList(deferred)
 
     def stop(self):
-        for component in self.components.values():
-            component.stop()
+        """Set stop flag and return a deferred connected 
with all running threads."""
+        self.stop_event.set()
         if self._deferred:
-            # set stop flag and return a deferred connected with all running threads
-            self.stop_event.set()
             return self._deferred
 
+    def close(self):
+        for component in self.components.values():
+            component.close()
+
     # Additional functions to manage specific components
 
@@ -160,6 +163,7 @@ def stop_tasks(self):
 
         d = Deferred()
         d.addBoth(self._stop_slot)
+        d.addBoth(self._close_slot)
         d.addBoth(self._perform_shutdown)
         d.addBoth(self._stop_reactor)
         return d
@@ -168,6 +172,10 @@ def _stop_slot(self, _=None):
         logger.info("Stopping DB worker slot.")
         return self.slot.stop()
 
+    def _close_slot(self, _=None):
+        logger.info('Closing DB worker slot resources.')
+        self.slot.close()
+
     def _perform_shutdown(self, _=None):
         logger.info("Stopping frontier manager.")
         self._manager.stop()
From 198e3b9b9e91a9e4133b728ae1aff98f758f8d62 Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Mon, 24 Jul 2017 17:10:05 +0300
Subject: [PATCH 095/273] Minor code simplification

---
 frontera/worker/db.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/frontera/worker/db.py b/frontera/worker/db.py
index 296e63c3a..b9c2f5104 100644
--- a/frontera/worker/db.py
+++ b/frontera/worker/db.py
@@ -66,13 +66,12 @@ def schedule(self):
         # component.schedule() function must return None or Deferred
         scheduled = [component.schedule() for component in self.components.values()]
         deferred = [result for result in scheduled if isinstance(result, Deferred)]
-        self._deferred = defer.DeferredList(deferred)
+        self._deferred = defer.DeferredList(deferred) if deferred else None
 
     def stop(self):
         """Set stop flag and return a deferred connected with all running threads."""
         self.stop_event.set()
-        if self._deferred:
-            return self._deferred
+        return self._deferred if self._deferred else None
 
     def close(self):
         for component in self.components.values():
From 5923e2124c878a39f82210e09e4594313f5abc53 Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Mon, 24 Jul 2017 22:38:50 +0300
Subject: [PATCH 096/273] Initial job_id must be 0 for db worker

---
 frontera/worker/db.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontera/worker/db.py b/frontera/worker/db.py
index b9c2f5104..c90512ecf 100644
--- a/frontera/worker/db.py
+++ b/frontera/worker/db.py
@@ -115,7 +115,7 @@ def __init__(self, settings, no_batches, no_incoming, no_scoring):
         self.slot = Slot(self, settings, **slot_kwargs)
 
         self.stats = defaultdict(int)
-        self.job_id = None
+        self.job_id = 0
         self._logging_task = task.LoopingCall(self.log_status)
 
     def run(self):
From d14c4c8284936e2e6a7631e085028c7392ba879a Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Wed, 26 Jul 2017 11:13:24 +0300
Subject: [PATCH 097/273] Smarter backoff for batchgen component

---
 frontera/worker/components/__init__.py        | 16 ++++++++++------
 frontera/worker/components/batch_generator.py |  4 ++--
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/frontera/worker/components/__init__.py b/frontera/worker/components/__init__.py
index 85734b2d7..024f03ebf 100644
--- a/frontera/worker/components/__init__.py
+++ b/frontera/worker/components/__init__.py
@@ -78,17 +78,21 @@ def loop(self):
         """Main entrypoint for the thread running loop."""
         while not self.stop_event.is_set():
             try:
-                self.run()
+                is_backoff_needed = self.run()
             except Exception as exc:
                 self.logger.exception('Exception in the main loop')
-            if self.run_backoff:
-                self.logger.debug('Sleep 
for {} seconds before next run()'
-                                  .format(self.run_backoff))
-                time.sleep(self.run_backoff)
+            else:
+                if is_backoff_needed and self.run_backoff:
+                    delay_msg = 'Sleep for {} seconds before next run()'
+                    self.logger.debug(delay_msg.format(self.run_backoff))
+                    time.sleep(self.run_backoff)
         self.logger.debug("Main loop was stopped")
 
     def run(self):
-        """Logic for single iteration of the component."""
+        """Logic for single iteration of the component.
+
+        The method must return a truthy value if a backoff is needed between iterations.
+        """
         raise NotImplementedError
 
     def update_stats(self, **kwargs):
diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py
index 36fb2f3d0..624f42d5f 100644
--- a/frontera/worker/components/batch_generator.py
+++ b/frontera/worker/components/batch_generator.py
@@ -37,11 +37,11 @@ def __init__(self, worker, settings, stop_event, no_batches=False, **kwargs):
 
     def run(self):
         if self.disabled_event.is_set():
-            return
+            return True
 
         partitions = self.spider_feed.available_partitions()
         if not partitions:
-            return
+            return True
         self.logger.info("Getting new batches for partitions %s",
                          str(",").join(map(str, partitions)))
 
From f6ed24f8420dfd1a0d83cc7287cd97ca23f11f5c Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Wed, 26 Jul 2017 11:31:48 +0300
Subject: [PATCH 098/273] Do back-off if no requests to schedule

---
 frontera/worker/components/batch_generator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py
index 624f42d5f..2486d6122 100644
--- a/frontera/worker/components/batch_generator.py
+++ b/frontera/worker/components/batch_generator.py
@@ -61,7 +61,8 @@ def run(self):
                 self.spider_feed_producer.send(self.get_key_function(request), eo)
             finally:
                 count += 1
-
+        if not count:
+            return True
         self.update_stats(increments={'pushed_since_start': count, 'batches_after_start': 1},
                           replacements={'last_batch_size': count,
                                         'last_batch_generated': asctime()})
From deb3617b2003c7f7460b90bfde876d6edaf5ab27 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Mon, 24 Jul 2017 17:35:00 +0500
Subject: [PATCH 099/273] max_request_size is set to 4Mb

---
 frontera/contrib/messagebus/kafkabus.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py
index cd53ed76f..0529848fe 100644
--- a/frontera/contrib/messagebus/kafkabus.py
+++ b/frontera/contrib/messagebus/kafkabus.py
@@ -116,6 +116,7 @@ def _create(self, enable_ssl, cert_path):
         self._producer = KafkaProducer(bootstrap_servers=self._location,
                                        retries=5,
                                        compression_type=self._compression,
+                                       max_request_size=4 * 1024 * 1024,
                                        **kwargs)
 
     def send(self, key, *messages):
@@ -140,6 +141,7 @@ def __init__(self, location, enable_ssl, cert_path, topic_done, partitioner, com
                                        partitioner=partitioner,
                                        retries=5,
                                        compression_type=self._compression,
+                                       max_request_size=4 * 1024 * 1024,
                                        **kwargs)
 
     def send(self, key, *messages):
From d5b1cc7d9bdda51c257e7475533b2ce51c5b28dd Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Thu, 3 Aug 2017 11:45:43 +0300
Subject: [PATCH 100/273] Update kafkabus producers settings

---
 frontera/contrib/messagebus/kafkabus.py | 34 ++++++++++++++++---
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py
index 0529848fe..50b7416ff 100644
--- a/frontera/contrib/messagebus/kafkabus.py 
+++ b/frontera/contrib/messagebus/kafkabus.py @@ -16,6 +16,8 @@ from os.path import join as os_path_join +DEFAULT_MAX_REQUEST_SIZE = 4 * 1024 * 1024 + logger = getLogger("messagebus.kafka") @@ -105,18 +107,19 @@ def close(self): class SimpleProducer(BaseStreamProducer): - def __init__(self, location, enable_ssl, cert_path, topic, compression): + def __init__(self, location, enable_ssl, cert_path, topic, compression, **kwargs): self._location = location self._topic = topic self._compression = compression - self._create(enable_ssl, cert_path) + self._create(enable_ssl, cert_path, **kwargs) - def _create(self, enable_ssl, cert_path): - kwargs = _prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {} + def _create(self, enable_ssl, cert_path, **kwargs): + max_request_size = kwargs.pop('max_request_size', DEFAULT_MAX_REQUEST_SIZE) + kwargs.update(_prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {}) self._producer = KafkaProducer(bootstrap_servers=self._location, retries=5, compression_type=self._compression, - max_request_size=4 * 1024 * 1024, + max_request_size=max_request_size, **kwargs) def send(self, key, *messages): @@ -131,17 +134,18 @@ def close(self): class KeyedProducer(BaseStreamProducer): - def __init__(self, location, enable_ssl, cert_path, topic_done, partitioner, compression): + def __init__(self, location, enable_ssl, cert_path, topic_done, partitioner, compression, **kwargs): self._location = location self._topic_done = topic_done self._partitioner = partitioner self._compression = compression - kwargs = _prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {} + max_request_size = kwargs.pop('max_request_size', DEFAULT_MAX_REQUEST_SIZE) + kwargs.update(_prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {}) self._producer = KafkaProducer(bootstrap_servers=self._location, partitioner=partitioner, retries=5, compression_type=self._compression, - max_request_size=4 * 1024 * 1024, + max_request_size=max_request_size, **kwargs) def send(self, key, *messages): @@ -168,7 +172,9 @@ def __init__(self, messagebus): def producer(self): return KeyedProducer(self._location, self._enable_ssl, self._cert_path, self._topic, - FingerprintPartitioner(self._partitions), self._codec) + FingerprintPartitioner(self._partitions), self._codec, + batch_size=1024 * 1024, + buffer_memory=130 * 1024 * 1024) def consumer(self, partition_id, type): """ @@ -219,7 +225,9 @@ def available_partitions(self): def producer(self): partitioner = Crc32NamePartitioner(self._partitions) if self._hostname_partitioning \ else FingerprintPartitioner(self._partitions) - return KeyedProducer(self._location, self._enable_ssl, self._cert_path, self._topic, partitioner, self._codec) + return KeyedProducer(self._location, self._enable_ssl, self._cert_path, self._topic, partitioner, self._codec, + batch_size=1024 * 1024, + buffer_memory=130 * 1024 * 1024) class ScoringLogStream(BaseScoringLogStream): @@ -235,7 +243,9 @@ def consumer(self): return Consumer(self._location, self._enable_ssl, self._cert_path, self._topic, self._group, partition_id=None) def producer(self): - return SimpleProducer(self._location, self._enable_ssl, self._cert_path, self._topic, self._codec) + return SimpleProducer(self._location, self._enable_ssl, self._cert_path, self._topic, self._codec, + batch_size=1024 * 1024, + buffer_memory=130 * 1024 * 1024) class StatsLogStream(BaseStatsLogStream, ScoringLogStream): @@ -282,4 +292,4 @@ def scoring_log(self): return ScoringLogStream(self) def stats_log(self): - return StatsLogStream(self) \ No 
newline at end of file + return StatsLogStream(self) From 1b7951af2daa63a2e6c10bd0e9c4231f264f1ffa Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 3 Aug 2017 12:03:01 +0300 Subject: [PATCH 101/273] Move values to constants --- frontera/contrib/messagebus/kafkabus.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py index 50b7416ff..73b9d903d 100644 --- a/frontera/contrib/messagebus/kafkabus.py +++ b/frontera/contrib/messagebus/kafkabus.py @@ -16,6 +16,8 @@ from os.path import join as os_path_join +DEFAULT_BATCH_SIZE = 1024 * 1024 +DEFAULT_BUFFER_MEMORY = 130 * 1024 * 1024 DEFAULT_MAX_REQUEST_SIZE = 4 * 1024 * 1024 logger = getLogger("messagebus.kafka") @@ -173,8 +175,8 @@ def __init__(self, messagebus): def producer(self): return KeyedProducer(self._location, self._enable_ssl, self._cert_path, self._topic, FingerprintPartitioner(self._partitions), self._codec, - batch_size=1024 * 1024, - buffer_memory=130 * 1024 * 1024) + batch_size=DEFAULT_BATCH_SIZE, + buffer_memory=DEFAULT_BUFFER_MEMORY) def consumer(self, partition_id, type): """ @@ -226,8 +228,8 @@ def producer(self): partitioner = Crc32NamePartitioner(self._partitions) if self._hostname_partitioning \ else FingerprintPartitioner(self._partitions) return KeyedProducer(self._location, self._enable_ssl, self._cert_path, self._topic, partitioner, self._codec, - batch_size=1024 * 1024, - buffer_memory=130 * 1024 * 1024) + batch_size=DEFAULT_BATCH_SIZE, + buffer_memory=DEFAULT_BUFFER_MEMORY) class ScoringLogStream(BaseScoringLogStream): @@ -244,8 +246,8 @@ def consumer(self): def producer(self): return SimpleProducer(self._location, self._enable_ssl, self._cert_path, self._topic, self._codec, - batch_size=1024 * 1024, - buffer_memory=130 * 1024 * 1024) + batch_size=DEFAULT_BATCH_SIZE, + buffer_memory=DEFAULT_BUFFER_MEMORY) class StatsLogStream(BaseStatsLogStream, ScoringLogStream): From b0e039e664f8aa93cbf5ec28ec2f15b19075bf4e Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 26 Jul 2017 13:09:48 +0300 Subject: [PATCH 102/273] more precise scoring, encoding from scrapy, hash for request # Conflicts: # frontera/contrib/backends/hbase.py --- frontera/contrib/backends/hbase.py | 4 ++-- frontera/contrib/scrapy/converters.py | 21 ++++++++++++++++----- frontera/core/models.py | 4 ++++ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 1bcdf40a6..0d50a9016 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -202,7 +202,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): tries = 0 count = 0 prefix = to_bytes('%d_' % partition_id) - # now_ts = int(time()) + now_ts = int(time()) # TODO: figure out how to use filter here, Thrift filter above causes full scan # filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) while tries < self.GET_RETRIES: @@ -213,7 +213,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): meta_map.clear() queue.clear() count = 0 - for rk, data in table.scan(limit=int(limit), batch_size=256, row_prefix=prefix): # filter=filter + for rk, data in table.scan(limit=int(limit), batch_size=256, row_prefix=prefix, sorted_columns=True): # filter=filter for cq, buf in six.iteritems(data): if cq == b'f:t': continue diff --git a/frontera/contrib/scrapy/converters.py 
b/frontera/contrib/scrapy/converters.py index fc013150d..50615c8fb 100644 --- a/frontera/contrib/scrapy/converters.py +++ b/frontera/contrib/scrapy/converters.py @@ -1,6 +1,7 @@ from __future__ import absolute_import from scrapy.http.request import Request as ScrapyRequest from scrapy.http.response import Response as ScrapyResponse +from scrapy.http.response.html import TextResponse from frontera.core.models import Request as FrontierRequest from frontera.core.models import Response as FrontierResponse @@ -83,6 +84,8 @@ def to_frontier(self, scrapy_response): frontier_request.meta[b'scrapy_meta'] = scrapy_response.meta if 'redirect_urls' in scrapy_response.meta: frontier_request.meta[b'redirect_urls'] = scrapy_response.meta['redirect_urls'] + if isinstance(scrapy_response, TextResponse): + frontier_request.meta[b'encoding'] = scrapy_response.encoding del scrapy_response.meta[b'frontier_request'] return FrontierResponse(url=scrapy_response.url, status_code=scrapy_response.status, @@ -92,11 +95,19 @@ def to_frontier(self, scrapy_response): def from_frontier(self, response): """response: Frontier > Scrapy""" - return ScrapyResponse(url=response.url, - status=response.status_code, - headers=response.headers, - body=response.body, - request=self._request_converter.from_frontier(response.request)) + if b'encoding' in response.meta: + return TextResponse(url=response.url, + status=response.status_code, + headers=response.headers, + body=response.body, + request=self._request_converter.from_frontier(response.request), + encoding=response.meta[b'encoding']) + else: + return ScrapyResponse(url=response.url, + status=response.status_code, + headers=response.headers, + body=response.body, + request=self._request_converter.from_frontier(response.request)) def _find_method(obj, func): diff --git a/frontera/core/models.py b/frontera/core/models.py index c1c8de734..480939a4b 100644 --- a/frontera/core/models.py +++ b/frontera/core/models.py @@ -83,6 +83,10 @@ def __str__(self): str(self.meta), str(self.body[:20]), str(self.cookies), str(self.headers)) + def __hash__(self): + return hash(self.meta[b'fingerprint']) + + __repr__ = __str__ From 66941c93bd13f66d8f4a8d985078583d44d8b3a0 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 6 Apr 2018 14:57:30 +0200 Subject: [PATCH 103/273] changing scrapy test website --- tests/scrapy_spider/spiders/example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scrapy_spider/spiders/example.py b/tests/scrapy_spider/spiders/example.py index 000c7c3b8..74d5d167e 100644 --- a/tests/scrapy_spider/spiders/example.py +++ b/tests/scrapy_spider/spiders/example.py @@ -5,7 +5,7 @@ class MySpider(CrawlSpider): name = 'example' - start_urls = ['http://www.dmoz.org'] + start_urls = ['https://en.wikipedia.org/wiki/Main_Page'] callback_calls = 0 rules = [Rule(LinkExtractor(), From 257ee934ed0b7de518d3d866ef82a588b0ebe150 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 6 Apr 2018 15:16:41 +0200 Subject: [PATCH 104/273] ov buffer test fix --- tests/test_core_overused_buffer.py | 14 +++++++------- tests/test_frontier_manager.py | 6 +++--- tests/test_message_bus_backend.py | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/test_core_overused_buffer.py b/tests/test_core_overused_buffer.py index 7801f9c3a..5a2ae2b06 100644 --- a/tests/test_core_overused_buffer.py +++ b/tests/test_core_overused_buffer.py @@ -3,16 +3,16 @@ from frontera.core.models import Request from six.moves import range from itertools 
import cycle -from random import choice, sample +from random import choice from string import ascii_lowercase -r1 = Request('http://www.example.com') -r2 = Request('http://www.example.com/some/') -r3 = Request('htttp://www.example.com/some/page/') -r4 = Request('http://example.com') -r5 = Request('http://example.com/some/page') -r6 = Request('http://example1.com') +r1 = Request('http://www.example.com', meta={b'fingerprint': b'8ece61d2d42e578e86d9f95ad063cf36eb8e774d'}) +r2 = Request('http://www.example.com/some/', meta={b'fingerprint': b'9773afd9cb0f4ec3fd09d6d1fe2c742abf0621ec'}) +r3 = Request('htttp://www.example.com/some/page/', meta={b'fingerprint': b'7278fb7612670523a7e3e37d7c38871c73bcb0ea'}) +r4 = Request('http://example.com', meta={b'fingerprint': b'89dce6a446a69d6b9bdc01ac75251e4c322bcdff'}) +r5 = Request('http://example.com/some/page', meta={b'fingerprint':b'9dbd730bdce21e322a12c757753f26bbc95c3779'}) +r6 = Request('http://example1.com', meta={b'fingerprint': b'0ac55362d7391707e121dace4d203a0dc4393afc'}) class TestOverusedBuffer(object): diff --git a/tests/test_frontier_manager.py b/tests/test_frontier_manager.py index 60d57970e..806154587 100644 --- a/tests/test_frontier_manager.py +++ b/tests/test_frontier_manager.py @@ -5,9 +5,9 @@ from six.moves import range -r1 = Request('http://www.example.com') -r2 = Request('https://www.example.com/some/page') -r3 = Request('http://example1.com') +r1 = Request('http://www.example.com', meta={b'fingerprint': b'8ece61d2d42e578e86d9f95ad063cf36eb8e774d'}) +r2 = Request('https://www.example.com/some/page', meta={b'fingerprint': b'61aec35fac3a032b3be3a5d07eb9e0024bd89de1'}) +r3 = Request('http://example1.com', meta={b'fingerprint': b'0ac55362d7391707e121dace4d203a0dc4393afc'}) class TestFrontierManager(object): diff --git a/tests/test_message_bus_backend.py b/tests/test_message_bus_backend.py index 68278d133..5a0818836 100644 --- a/tests/test_message_bus_backend.py +++ b/tests/test_message_bus_backend.py @@ -6,9 +6,9 @@ from frontera.core.models import Request, Response -r1 = Request('http://www.example.com/', meta={b'domain': {b'fingerprint': b'1'}}) -r2 = Request('http://www.scrapy.org/', meta={b'domain': {b'fingerprint': b'2'}}) -r3 = Request('http://www.test.com/some/page', meta={b'domain': {b'fingerprint': b'3'}}) +r1 = Request('http://www.example.com/', meta={b'domain': {b'fingerprint': b'1'}, b'fingerprint': b'abc'}) +r2 = Request('http://www.scrapy.org/', meta={b'domain': {b'fingerprint': b'2'}, b'fingerprint': b'012'}) +r3 = Request('http://www.test.com/some/page', meta={b'domain': {b'fingerprint': b'3'}, b'fingerprint': b'345'}) class TestMessageBusBackend(unittest.TestCase): From 637012b31b495c7302a137c88dbdd51149f6e3e7 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 9 Apr 2018 17:31:47 +0200 Subject: [PATCH 105/273] making Kafka mb to support latest kafka-python --- frontera/contrib/messagebus/kafkabus.py | 34 +++---------------------- requirements/tests.txt | 2 +- setup.py | 2 +- tests/test_message_bus.py | 9 ++++++- 4 files changed, 14 insertions(+), 33 deletions(-) diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py index 73b9d903d..9add0d943 100644 --- a/frontera/contrib/messagebus/kafkabus.py +++ b/frontera/contrib/messagebus/kafkabus.py @@ -54,30 +54,11 @@ def __init__(self, location, enable_ssl, cert_path, topic, group, partition_id): ) if partition_id is not None: - self._partition_ids = [TopicPartition(self._topic, partition_id)] - 
self._consumer.assign(self._partition_ids) + self._partitions = [TopicPartition(self._topic, partition_id)] + self._consumer.assign(self._partitions) else: - self._partition_ids = [TopicPartition(self._topic, pid) for pid in self._consumer.partitions_for_topic(self._topic)] + self._partitions = [TopicPartition(self._topic, pid) for pid in self._consumer.partitions_for_topic(self._topic)] self._consumer.subscribe(topics=[self._topic]) - if self._consumer._use_consumer_group(): - self._consumer._coordinator.ensure_coordinator_known() - self._consumer._coordinator.ensure_active_group() - - self._consumer._update_fetch_positions(self._partition_ids) - self._start_looping_call() - - def _start_looping_call(self, interval=10): - def errback(failure): - logger.exception(failure.value) - if failure.frames: - logger.critical(str("").join(format_tb(failure.getTracebackObject()))) - self._poll_task.start(interval).addErrback(errback) - - self._poll_task = LoopingCall(self._poll_client) - self._poll_task.start(interval).addErrback(errback) - - def _poll_client(self): - self._consumer._client.poll() def get_messages(self, timeout=0.1, count=1): result = [] @@ -91,20 +72,13 @@ def get_messages(self, timeout=0.1, count=1): return result def get_offset(self, partition_id): - for tp in self._partition_ids: + for tp in self._partitions: if tp.partition == partition_id: return self._consumer.position(tp) raise KeyError("Can't find partition %d", partition_id) def close(self): - self._poll_task.stop() self._consumer.commit() - # getting kafka client event loop running some more and execute commit - tries = 3 - while tries: - self.get_messages() - sleep(2.0) - tries -= 1 self._consumer.close() diff --git a/requirements/tests.txt b/requirements/tests.txt index 82efdbce3..0c63dc53c 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -7,7 +7,7 @@ SQLAlchemy>=1.0.0 cachetools pyzmq msgpack-python>=0.4 -kafka-python>=1.0.0 +kafka-python>=1.4.0 pytest-cov happybase>=1.0.0 mock diff --git a/setup.py b/setup.py index 518e28f23..1c357c7bf 100644 --- a/setup.py +++ b/setup.py @@ -68,7 +68,7 @@ 'msgpack-python>=0.4' ], 'kafka': [ - 'kafka-python>=1.0.0' + 'kafka-python>=1.4.0' ], 'distributed': [ 'Twisted' diff --git a/tests/test_message_bus.py b/tests/test_message_bus.py index 9575b5d16..5293c2445 100644 --- a/tests/test_message_bus.py +++ b/tests/test_message_bus.py @@ -2,7 +2,7 @@ from __future__ import absolute_import from frontera.settings import Settings from frontera.contrib.messagebus.zeromq import MessageBus as ZeroMQMessageBus -from frontera.contrib.messagebus.kafkabus import MessageBus as KafkaMessageBus, Consumer as KafkaConsumer +from frontera.contrib.messagebus.kafkabus import MessageBus as KafkaMessageBus from frontera.utils.fingerprint import sha1 from kafka import KafkaClient from random import randint @@ -124,6 +124,8 @@ def setUp(self): logger.setLevel(logging.INFO) logger.addHandler(handler) + self.logger = logging.getLogger("tester") + self.logger.debug("setup started") kafka_location = "127.0.0.1:9092" client = KafkaClient(kafka_location) client.ensure_topic_exists("frontier-todo") @@ -155,8 +157,11 @@ def setUp(self): # spider self.sp_sl_p = spiderlog.producer() self.sp_sf_c = KafkaConsumerPolling(spider_feed.consumer(partition_id=0)) + self.logger.debug("init is done") + def tearDown(self): + self.logger.debug("teardown") self.sw_us_p.close() self.db_sf_p.close() self.sp_sl_p.close() @@ -167,12 +172,14 @@ def tearDown(self): self.sp_sf_c.close() def spider_log_activity(self, 
messages): + self.logger.debug("spider log activity entered") for i in range(0, messages): if i % 2 == 0: self.sp_sl_p.send(sha1(str(randint(1, 1000))), b'http://helloworld.com/way/to/the/sun/' + b'0') else: self.sp_sl_p.send(sha1(str(randint(1, 1000))), b'http://way.to.the.sun' + b'0') self.sp_sl_p.flush() + self.logger.debug("spider log activity finished") def spider_feed_activity(self): sf_c = 0 From 3a8e2d6b22d4e0ea79fa7d8c2d6b0761e5298e3e Mon Sep 17 00:00:00 2001 From: Akshay Philar Date: Fri, 13 Apr 2018 04:57:35 +0530 Subject: [PATCH 106/273] string formatting corrected in the connection information being logged --- frontera/worker/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/worker/server.py b/frontera/worker/server.py index a77a49bae..cb7137d1a 100644 --- a/frontera/worker/server.py +++ b/frontera/worker/server.py @@ -148,7 +148,7 @@ def __init__(self, root, settings): def start_listening(self): self.port = listen_tcp(self.portrange, self.host, self) h = self.port.getHost() - logger.info('Web service listening on %(host)s:%(port)d'.format(host=h.host, port=h.port)) + logger.info('Web service listening on {host}:{port}'.format(host=h.host, port=h.port)) def stop_listening(self): self.port.stopListening() From 0908e03265cd9dd76d409d336a74393d50c70049 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 23 Apr 2018 14:47:29 +0200 Subject: [PATCH 107/273] db worker test fix --- frontera/worker/components/__init__.py | 6 +- tests/test_worker_db.py | 86 +++++++++++++++----------- 2 files changed, 54 insertions(+), 38 deletions(-) diff --git a/frontera/worker/components/__init__.py b/frontera/worker/components/__init__.py index 024f03ebf..cc0bc41fe 100644 --- a/frontera/worker/components/__init__.py +++ b/frontera/worker/components/__init__.py @@ -97,4 +97,8 @@ def run(self): def update_stats(self, **kwargs): """Helper to update worker stats.""" - threads.blockingCallFromThread(reactor, self.worker.update_stats, **kwargs) + if reactor.running: + threads.blockingCallFromThread(reactor, self.worker.update_stats, **kwargs) + else: + # for testing purposes + self.worker.update_stats(**kwargs) diff --git a/tests/test_worker_db.py b/tests/test_worker_db.py index 05b91d0c2..dce263072 100644 --- a/tests/test_worker_db.py +++ b/tests/test_worker_db.py @@ -1,5 +1,5 @@ from frontera.core.models import Request, Response -from frontera.worker.db import DBWorker +from frontera.worker.db import DBWorker, ScoringConsumer, IncomingConsumer, BatchGenerator from frontera.settings import Settings from frontera.core.components import States @@ -19,71 +19,83 @@ def dbw_setup(self, distributed=False): settings.BACKEND = 'tests.mocks.components.FakeDistributedBackend' else: settings.BACKEND = 'tests.mocks.components.FakeBackend' - return DBWorker(settings, True, True, False) + return DBWorker(settings, False, False, False) def test_add_seeds(self): dbw = self.dbw_setup() msg = dbw._encoder.encode_add_seeds([r1, r2, r3]) - dbw.spider_log_consumer.put_messages([msg]) - dbw.consume_incoming() - assert set([r.url for r in dbw._backend.seeds]) == set([r.url for r in [r1, r2, r3]]) + incoming_consumer = dbw.slot.components[IncomingConsumer] + incoming_consumer.spider_log_consumer.put_messages([msg]) + incoming_consumer.run() + assert set([r.url for r in incoming_consumer.backend.seeds]) == set([r.url for r in [r1, r2, r3]]) def test_page_crawled(self): dbw = self.dbw_setup() resp = Response(r1.url, request=r1) msg = dbw._encoder.encode_page_crawled(resp) - 
dbw.spider_log_consumer.put_messages([msg]) - dbw.consume_incoming() - assert set([r.url for r in dbw._backend.responses]) == set([r1.url]) + incoming_consumer = dbw.slot.components[IncomingConsumer] + incoming_consumer.spider_log_consumer.put_messages([msg]) + incoming_consumer.run() + assert set([r.url for r in incoming_consumer.backend.responses]) == set([r1.url]) def test_links_extracted(self): dbw = self.dbw_setup() msg = dbw._encoder.encode_links_extracted(r1, [r2, r3]) - dbw.spider_log_consumer.put_messages([msg]) - dbw.consume_incoming() - assert set([r.url for r in dbw._backend.links]) == set([r2.url, r3.url]) + incoming_consumer = dbw.slot.components[IncomingConsumer] + incoming_consumer.spider_log_consumer.put_messages([msg]) + incoming_consumer.run() + assert set([r.url for r in incoming_consumer.backend.links]) == set([r2.url, r3.url]) def test_request_error(self): dbw = self.dbw_setup() msg = dbw._encoder.encode_request_error(r1, 'error') - dbw.spider_log_consumer.put_messages([msg]) - dbw.consume_incoming() - assert dbw._backend.errors[0][0].url == r1.url - assert dbw._backend.errors[0][1] == 'error' + incoming_consumer = dbw.slot.components[IncomingConsumer] + incoming_consumer.spider_log_consumer.put_messages([msg]) + incoming_consumer.run() + assert incoming_consumer.backend.errors[0][0].url == r1.url + assert incoming_consumer.backend.errors[0][1] == 'error' def test_scoring(self): dbw = self.dbw_setup(True) - msg = dbw._encoder.encode_add_seeds([r1, r2, r3]) - dbw.spider_log_consumer.put_messages([msg]) - dbw.consume_incoming() - assert dbw.new_batch() == 0 + batch_gen = dbw.slot.components[BatchGenerator] + batch_gen.run() + assert dbw.stats["last_batch_size"] == 0 msg1 = dbw._encoder.encode_update_score(r1, 0.5, True) msg2 = dbw._encoder.encode_update_score(r3, 0.6, True) - dbw.scoring_log_consumer.put_messages([msg1, msg2]) - dbw.consume_scoring() - assert set([r.url for r in dbw._backend.queue.requests]) == set([r1.url, r3.url]) - assert dbw.new_batch() == 2 + scoring_worker = dbw.slot.components[ScoringConsumer] + scoring_worker.scoring_log_consumer.put_messages([msg1, msg2]) + scoring_worker.run() + assert set([r.url for r in dbw.backend.queue.requests]) == set([r1.url, r3.url]) + batch_gen.run() + assert dbw.stats["last_batch_size"] == 2 def test_new_batch(self): dbw = self.dbw_setup(True) - dbw._backend.queue.put_requests([r1, r2, r3]) - assert dbw.new_batch() == 3 - assert set(dbw.spider_feed_producer.messages) == \ + batch_gen = dbw.slot.components[BatchGenerator] + batch_gen.backend.queue.put_requests([r1, r2, r3]) + batch_gen.run() + assert dbw.stats["last_batch_size"] == 3 + assert set(batch_gen.spider_feed_producer.messages) == \ set([dbw._encoder.encode_request(r) for r in [r1, r2, r3]]) def test_offset(self): dbw = self.dbw_setup(True) + incoming_worker = dbw.slot.components[IncomingConsumer] + batch_gen = dbw.slot.components[BatchGenerator] + batch_gen.spider_feed = incoming_worker.spider_feed + batch_gen.spider_feed_producer = incoming_worker.spider_feed_producer msg = dbw._encoder.encode_offset(2, 50) - dbw.spider_log_consumer.put_messages([msg]) - dbw.spider_feed_producer.offset = 100 - dbw.consume_incoming() - assert 2 in dbw.spider_feed.available_partitions() + incoming_worker.spider_log_consumer.put_messages([msg]) + incoming_worker.spider_feed_producer.offset = 100 + incoming_worker.run() + assert 2 in batch_gen.spider_feed.available_partitions() msg1 = dbw._encoder.encode_offset(2, 20) msg2 = dbw._encoder.encode_offset(3, 0) - 
dbw.spider_log_consumer.put_messages([msg1, msg2]) - dbw.consume_incoming() - assert 3 in dbw.spider_feed.available_partitions() - assert 2 not in dbw.spider_feed.available_partitions() - dbw._backend.queue.put_requests([r1, r2, r3]) - assert dbw.new_batch() == 3 - assert 3 in dbw._backend.partitions + incoming_worker.spider_log_consumer.put_messages([msg1, msg2]) + incoming_worker.run() + assert 3 in batch_gen.spider_feed.available_partitions() + assert 2 not in batch_gen.spider_feed.available_partitions() + batch_gen.backend.queue.put_requests([r1, r2, r3]) + batch_gen.run() + assert dbw.stats["last_batch_size"] == 3 + assert 3 in batch_gen.backend.partitions From 786c2cd1a5c23c7eb6e29a0eafc201046c4c0622 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 2 Aug 2017 18:35:56 +0300 Subject: [PATCH 108/273] Setting for splitting partitions b/w batchgens --- frontera/settings/default_settings.py | 1 + frontera/worker/components/batch_generator.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index b7d65ac1c..1c20c303b 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -32,6 +32,7 @@ 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware', ] NEW_BATCH_DELAY = 30.0 +BATCH_PARTITIONS = None DOMAINS_BLACKLIST = None OVERUSED_SLOT_FACTOR = 5.0 OVERUSED_MAX_PER_KEY = None diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 2486d6122..7d70c2208 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -32,14 +32,20 @@ def __init__(self, worker, settings, stop_event, no_batches=False, **kwargs): self.domains_blacklist = settings.get('DOMAINS_BLACKLIST') self.max_next_requests = settings.MAX_NEXT_REQUESTS + self.partitions = settings.get('BATCH_PARTITIONS') # create an event to disable/enable batches generation via RPC self.disabled_event = threading.Event() + def get_partitions(self): + pending_partitions = self.spider_feed.available_partitions() + if not self.partitions: + return pending_partitions + return list(set(pending_partitions) & set(self.partitions)) + def run(self): if self.disabled_event.is_set(): return True - - partitions = self.spider_feed.available_partitions() + partitions = self.get_partitions() if not partitions: return True self.logger.info("Getting new batches for partitions %s", From 4093cb3a4d37fba352f3ed1b8cd6fd81a3be682e Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 2 Aug 2017 19:15:45 +0300 Subject: [PATCH 109/273] conflicts resolved --- frontera/contrib/backends/hbase.py | 22 +++++++++---------- frontera/worker/components/__init__.py | 2 +- frontera/worker/components/batch_generator.py | 19 +++++++++------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 0d50a9016..e035d801f 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -511,19 +511,17 @@ def finished(self): raise NotImplementedError def get_next_requests(self, max_next_requests, **kwargs): - next_pages = [] self.logger.debug("Querying queue table.") - partitions = set(kwargs.pop('partitions', [])) - for partition_id in range(0, self.queue_partitions): - if partition_id not in partitions: - continue - results = self.queue.get_next_requests(max_next_requests, partition_id, - 
min_requests=self._min_requests, - min_hosts=self._min_hosts, - max_requests_per_host=self._max_requests_per_host) - next_pages.extend(results) - self.logger.debug("Got %d requests for partition id %d", len(results), partition_id) - return next_pages + for partition_id in set(kwargs.pop('partitions', [])): + count = 0 + for request in self.queue.get_next_requests( + max_next_requests, partition_id, + min_requests=self._min_requests, + min_hosts=self._min_hosts, + max_requests_per_host=self._max_requests_per_host): + count += 1 + yield request + self.logger.debug("Got %d requests for partition id %d", count, partition_id) def get_stats(self): """Helper to get stats dictionary for the backend. diff --git a/frontera/worker/components/__init__.py b/frontera/worker/components/__init__.py index cc0bc41fe..f71cb2b8f 100644 --- a/frontera/worker/components/__init__.py +++ b/frontera/worker/components/__init__.py @@ -98,7 +98,7 @@ def run(self): def update_stats(self, **kwargs): """Helper to update worker stats.""" if reactor.running: - threads.blockingCallFromThread(reactor, self.worker.update_stats, **kwargs) + reactor.callFromThread(self.worker.update_stats, **kwargs) else: # for testing purposes self.worker.update_stats(**kwargs) diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 7d70c2208..5db174d54 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -48,12 +48,18 @@ def run(self): partitions = self.get_partitions() if not partitions: return True - self.logger.info("Getting new batches for partitions %s", - str(",").join(map(str, partitions))) + batch_count = sum(self._handle_partition(partition_id) + for partition_id in partitions) + # let's count full batches in the same way as before + self.update_stats(increments={'batches_after_start': 1}, + replacements={'last_batch_size': batch_count, + 'last_batch_generated': asctime()}) + def _handle_partition(self, partition_id): + self.logger.info("Getting new batches for partition %d", partition_id) count = 0 for request in self.backend.get_next_requests(self.max_next_requests, - partitions=partitions): + partitions=[partition_id]): if self._is_domain_blacklisted(request): continue try: @@ -67,11 +73,8 @@ def run(self): self.spider_feed_producer.send(self.get_key_function(request), eo) finally: count += 1 - if not count: - return True - self.update_stats(increments={'pushed_since_start': count, 'batches_after_start': 1}, - replacements={'last_batch_size': count, - 'last_batch_generated': asctime()}) + self.update_stats(increments={'pushed_since_start': count}) + return count def _is_domain_blacklisted(self, request): if not self.domains_blacklist: From 343fe3110551258d93c20b261d909aad40cef0e3 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 2 Aug 2017 19:40:27 +0300 Subject: [PATCH 110/273] Config option to pass partitions --- frontera/worker/components/batch_generator.py | 5 +++-- frontera/worker/db.py | 10 +++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 5db174d54..7300c992e 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -16,7 +16,8 @@ class BatchGenerator(DBWorkerThreadComponent): NAME = 'batchgen' - def __init__(self, worker, settings, stop_event, no_batches=False, **kwargs): + def __init__(self, worker, settings, 
stop_event,
+                 no_batches=False, partitions=None, **kwargs):
         super(BatchGenerator, self).__init__(worker, settings, stop_event, **kwargs)
         if no_batches:
             raise NotConfigured('BatchGenerator is disabled with --no-batches')
@@ -32,7 +33,7 @@ def __init__(self, worker, settings, stop_event, no_batches=False, **kwargs):
 
         self.domains_blacklist = settings.get('DOMAINS_BLACKLIST')
         self.max_next_requests = settings.MAX_NEXT_REQUESTS
-        self.partitions = settings.get('BATCH_PARTITIONS')
+        self.partitions = partitions
         # create an event to disable/enable batches generation via RPC
         self.disabled_event = threading.Event()
 
diff --git a/frontera/worker/db.py b/frontera/worker/db.py
index c90512ecf..9ee988e3c 100644
--- a/frontera/worker/db.py
+++ b/frontera/worker/db.py
@@ -95,7 +95,7 @@ def manage_new_batches(self, enable):
 class BaseDBWorker(object):
     """Base database worker class."""
 
-    def __init__(self, settings, no_batches, no_incoming, no_scoring):
+    def __init__(self, settings, no_batches, no_incoming, no_scoring, **kwargs):
 
         messagebus = load_object(settings.get('MESSAGE_BUS'))
         self.message_bus = messagebus(settings)
@@ -112,6 +112,7 @@ def __init__(self, settings, no_batches, no_incoming, no_scoring):
         slot_kwargs = {'no_batches': no_batches,
                        'no_incoming': no_incoming,
                        'no_scoring': no_scoring}
+        slot_kwargs.update(**kwargs)
         self.slot = Slot(self, settings, **slot_kwargs)
 
         self.stats = defaultdict(int)
@@ -193,7 +194,7 @@ class DBWorker(StatsExportMixin, BaseDBWorker):
     The additional features are provided by using mixin classes:
     - sending crawl stats to message bus
     """
-    def get_stats_tags(self, settings, no_batches, no_incoming, no_scoring):
+    def get_stats_tags(self, settings, no_batches, no_incoming, no_scoring, **kwargs):
         if no_batches and no_scoring:
             db_worker_type = 'linksdb'
         elif no_batches and no_incoming:
@@ -221,6 +222,8 @@ def get_stats_tags(self, settings, no_batches, no_incoming, no_scoring):
                         help='Disables spider log processing.')
     parser.add_argument('--no-scoring', action='store_true',
                         help='Disables scoring log processing.')
+    parser.add_argument('--partitions', type=int, nargs='*',
+                        help='Optional partitions range for batch generator')
     parser.add_argument('--config', type=str, required=True,
                         help='Settings module name, should be accessible by import.')
     parser.add_argument('--log-level', '-L', type=str, default='INFO',
@@ -240,7 +243,8 @@ def get_stats_tags(self, settings, no_batches, no_incoming, no_scoring):
     logger.setLevel(args.log_level)
     logger.addHandler(CONSOLE)
 
-    worker = DBWorker(settings, args.no_batches, args.no_incoming, args.no_scoring)
+    worker = DBWorker(settings, args.no_batches, args.no_incoming,
+                      args.no_scoring, partitions=args.partitions)
     server = WorkerJsonRpcService(worker, settings)
     server.start_listening()
     worker.run()

From b4547c7548687db4d197fb2170e0f2ace36355bb Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Wed, 2 Aug 2017 19:46:38 +0300
Subject: [PATCH 111/273] No need for a new setting

---
 frontera/settings/default_settings.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py
index 1c20c303b..b7d65ac1c 100644
--- a/frontera/settings/default_settings.py
+++ b/frontera/settings/default_settings.py
@@ -32,7 +32,6 @@
     'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware',
 ]
 NEW_BATCH_DELAY = 30.0
-BATCH_PARTITIONS = None
 DOMAINS_BLACKLIST = None
 OVERUSED_SLOT_FACTOR = 5.0
 OVERUSED_MAX_PER_KEY = None

From c44fe0553a26d51d2162f6ce5b40310904ade83a Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov Date: Thu, 3 Aug 2017 10:41:13 +0300 Subject: [PATCH 112/273] Minor renaming --- frontera/worker/components/batch_generator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 7300c992e..e760aa542 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -37,7 +37,7 @@ def __init__(self, worker, settings, stop_event, # create an event to disable/enable batches generation via RPC self.disabled_event = threading.Event() - def get_partitions(self): + def get_ready_partitions(self): pending_partitions = self.spider_feed.available_partitions() if not self.partitions: return pending_partitions @@ -46,11 +46,13 @@ def get_partitions(self): def run(self): if self.disabled_event.is_set(): return True - partitions = self.get_partitions() + partitions = self.get_ready_partitions() if not partitions: return True batch_count = sum(self._handle_partition(partition_id) for partition_id in partitions) + if not batch_count: + return True # let's count full batches in the same way as before self.update_stats(increments={'batches_after_start': 1}, replacements={'last_batch_size': batch_count, From 7a1dee9b71f652429380162d417f5e404f40e377 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 3 Aug 2017 10:49:55 +0300 Subject: [PATCH 113/273] Wrap sending message logic --- frontera/worker/components/batch_generator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 2486d6122..b0370e53c 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -56,9 +56,13 @@ def run(self): except Exception as e: self.logger.error("Encoding error, %s, fingerprint: %s, url: %s" % (e, self.get_fingerprint(request), request.url)) + count += 1 # counts as a processed request continue - else: + try: self.spider_feed_producer.send(self.get_key_function(request), eo) + except Exception as exc: + self.logger.error("Sending message error, %s, fingerprint: %s, url: %s" % + (exc, self.get_fingerprint(request), request.url)) finally: count += 1 if not count: From eaee5a40d74df66fea2234f9be8ae879300e44c6 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 6 Sep 2017 17:45:53 +0300 Subject: [PATCH 114/273] Catch and print shutdown errors --- frontera/worker/strategy.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 5fb9eb6c7..a249f89e5 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -298,14 +298,17 @@ def _stop_reactor(self, _=None): pass def _perform_shutdown(self, _=None): - self.flush_states() - logger.info("Closing crawling strategy.") - self.strategy.close() - logger.info("Stopping frontier manager.") - self._manager.stop() - logger.info("Closing message bus.") - self.scoring_log_producer.close() - self.consumer.close() + try: + self.flush_states() + logger.info("Closing crawling strategy.") + self.strategy.close() + logger.info("Stopping frontier manager.") + self._manager.stop() + logger.info("Closing message bus.") + self.scoring_log_producer.close() + self.consumer.close() + except: + logger.exception('Error on shutdown') def on_add_seeds(self, seeds): logger.debug('Adding %i seeds', len(seeds)) From 
426c1bc8fb4d173b5e4707f824de47329afe5014 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 11 Aug 2017 18:57:19 +0300 Subject: [PATCH 115/273] Non-exhausted generators must be closed manually --- frontera/contrib/backends/hbase.py | 45 +++++++++++++++++------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 0d50a9016..0e9cb8264 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -213,26 +213,33 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): meta_map.clear() queue.clear() count = 0 - for rk, data in table.scan(limit=int(limit), batch_size=256, row_prefix=prefix, sorted_columns=True): # filter=filter - for cq, buf in six.iteritems(data): - if cq == b'f:t': - continue - stream = BytesIO(buf) - unpacker = Unpacker(stream) - for item in unpacker: - fprint, host_crc32, _, _ = item - if host_crc32 not in queue: - queue[host_crc32] = [] - if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host: + # XXX pypy hot-fix: non-exhausted generator must be closed manually + # otherwise "finally" piece in table.scan() method won't be executed + # immediately to properly close scanner (http://pypy.org/compat.html) + scan_gen = table.scan(limit=int(limit), batch_size=256, row_prefix=prefix, sorted_columns=True) + try: + for rk, data in scan_gen: + for cq, buf in six.iteritems(data): + if cq == b'f:t': continue - queue[host_crc32].append(fprint) - count += 1 - - if fprint not in meta_map: - meta_map[fprint] = [] - meta_map[fprint].append((rk, item)) - if count > max_n_requests: - break + stream = BytesIO(buf) + unpacker = Unpacker(stream) + for item in unpacker: + fprint, host_crc32, _, _ = item + if host_crc32 not in queue: + queue[host_crc32] = [] + if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host: + continue + queue[host_crc32].append(fprint) + count += 1 + + if fprint not in meta_map: + meta_map[fprint] = [] + meta_map[fprint].append((rk, item)) + if count > max_n_requests: + break + finally: + scan_gen.close() if min_hosts is not None and len(queue.keys()) < min_hosts: continue From e902ad3170f42d69bd89d253ae0cec86e3270b93 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Mon, 21 Aug 2017 14:12:32 +0300 Subject: [PATCH 116/273] conflicts resolved --- frontera/contrib/backends/hbase.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 0d50a9016..2a5663f1c 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -22,7 +22,7 @@ from binascii import hexlify, unhexlify from io import BytesIO from random import choice -from collections import Iterable +from collections import defaultdict, Iterable import logging @@ -81,10 +81,10 @@ def __init__(self, connection, partitions, table_name, drop=False, use_snappy=Fa self.connection.delete_table(self.table_name, disable=True) tables.remove(self.table_name) - schema = {'f': {'max_versions': 1}} - if use_snappy: - schema['f']['compression'] = 'SNAPPY' if self.table_name not in tables: + schema = {'f': {'max_versions': 1}} + if use_snappy: + schema['f']['compression'] = 'SNAPPY' self.connection.create_table(self.table_name, schema) class DumbResponse: @@ -283,6 +283,7 @@ def __init__(self, connection, table_name, cache_size_limit, self._table_name = to_bytes(table_name) 
self.logger = logging.getLogger("hbase.states") self._state_cache = LRUCache(maxsize=cache_size_limit) + self._state_cache_stats = defaultdict(int) self._state_batch = self.connection.table( self._table_name).batch(batch_size=write_log_size) @@ -316,6 +317,8 @@ def flush(self): def fetch(self, fingerprints): to_fetch = [f for f in fingerprints if f not in self._state_cache] + self._update_cache_stats(hits=len(fingerprints)-len(to_fetch), + misses=len(to_fetch)) if not to_fetch: return self.logger.debug('Fetching %d/%d elements from HBase (cache size %d)', @@ -329,6 +332,15 @@ def fetch(self, fingerprints): state = unpack('>B', cells[b's:state'])[0] self._state_cache[hexlify(key)] = state + def _update_cache_stats(self, hits, misses): + self._state_cache_stats['states.cache.hits'] += hits + self._state_cache_stats['states.cache.misses'] += misses + + def get_stats(self): + stats = self._state_cache_stats.copy() + self._state_cache_stats.clear() + return stats + class HBaseMetadata(Metadata): def __init__(self, connection, table_name, drop_all_tables, use_snappy, batch_size, store_content): @@ -530,5 +542,9 @@ def get_stats(self): For now it provides only HBase client stats. """ + stats = {} with time_elapsed('Call HBase backend get_stats()'): - return self.connection.client.get_stats() + stats.update(self.connection.client.get_stats()) + if self._states: + stats.update(self._states.get_stats()) + return stats From 7cb12c074c257f4bd32a1dab949604dcb4cc7d35 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 24 Aug 2017 11:51:40 +0300 Subject: [PATCH 117/273] Provide SW states cache hits ratio --- frontera/contrib/backends/hbase.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 2a5663f1c..b0efd5af2 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import +from __future__ import absolute_import, division from frontera.utils.url import parse_domain_from_url_fast from frontera import DistributedBackend from frontera.core.components import Metadata, Queue, States @@ -333,8 +333,12 @@ def fetch(self, fingerprints): self._state_cache[hexlify(key)] = state def _update_cache_stats(self, hits, misses): - self._state_cache_stats['states.cache.hits'] += hits - self._state_cache_stats['states.cache.misses'] += misses + total_hits = self._state_cache_stats['states.cache.hits'] + hits + total_misses = self._state_cache_stats['states.cache.misses'] + misses + total = total_hits + total_misses + self._state_cache_stats['states.cache.hits'] = total_hits + self._state_cache_stats['states.cache.misses'] = total_misses + self._state_cache_stats['states.cache.ratio'] = total_hits / total if total else 0 def get_stats(self): stats = self._state_cache_stats.copy() From 40a2b786ce8f733eef2b64dd433e15875cdb46fd Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Thu, 24 Aug 2017 19:16:56 +0300 Subject: [PATCH 118/273] Provide flushed batches count for SW states --- frontera/contrib/backends/hbase.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index b0efd5af2..458b39366 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -283,9 +283,9 @@ def __init__(self, connection, table_name, cache_size_limit, self._table_name = to_bytes(table_name) 
self.logger = logging.getLogger("hbase.states")
         self._state_cache = LRUCache(maxsize=cache_size_limit)
-        self._state_cache_stats = defaultdict(int)
         self._state_batch = self.connection.table(
             self._table_name).batch(batch_size=write_log_size)
+        self._state_stats = defaultdict(int)
 
         tables = set(connection.tables())
         if drop_all_tables and self._table_name in tables:
@@ -306,6 +306,8 @@ def update_cache(self, objs):
             self._state_batch.put(unhexlify(fingerprint), prepare_hbase_object(state=state))
             # update LRU cache with the state update
             self._state_cache[fingerprint] = state
+            self._state_stats['_state_updates'] += 1
+        self._update_batch_stats()
 
     def set_states(self, objs):
         objs = objs if isinstance(objs, Iterable) else [objs]
@@ -332,17 +334,23 @@ def fetch(self, fingerprints):
                 state = unpack('>B', cells[b's:state'])[0]
                 self._state_cache[hexlify(key)] = state
 
+    def _update_batch_stats(self):
+        new_batches_count, self._state_stats['_state_updates'] = divmod(
+            self._state_stats['_state_updates'], self._state_batch._batch_size)
+        self._state_stats['states.batches.sent'] += new_batches_count
+
     def _update_cache_stats(self, hits, misses):
-        total_hits = self._state_cache_stats['states.cache.hits'] + hits
-        total_misses = self._state_cache_stats['states.cache.misses'] + misses
+        total_hits = self._state_stats['states.cache.hits'] + hits
+        total_misses = self._state_stats['states.cache.misses'] + misses
         total = total_hits + total_misses
-        self._state_cache_stats['states.cache.hits'] = total_hits
-        self._state_cache_stats['states.cache.misses'] = total_misses
-        self._state_cache_stats['states.cache.ratio'] = total_hits / total if total else 0
+        self._state_stats['states.cache.hits'] = total_hits
+        self._state_stats['states.cache.misses'] = total_misses
+        self._state_stats['states.cache.ratio'] = total_hits / total if total else 0
 
     def get_stats(self):
-        stats = self._state_cache_stats.copy()
-        self._state_cache_stats.clear()
+        stats = {stat: value for stat, value in self._state_stats.items()
+                 if not stat.startswith('_')}  # do not report entries with _-prefix
+        self._state_stats.clear()
         return stats
 

From a3762f2aff148881cbdc5089b8a374fe5c84d0a8 Mon Sep 17 00:00:00 2001
From: Viktor Shlapakov
Date: Thu, 24 Aug 2017 20:00:38 +0300
Subject: [PATCH 119/273] Simplify state stats using a variable

---
 frontera/contrib/backends/hbase.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py
index 458b39366..b559d4e7b 100644
--- a/frontera/contrib/backends/hbase.py
+++ b/frontera/contrib/backends/hbase.py
@@ -286,6 +286,7 @@ def __init__(self, connection, table_name, cache_size_limit,
         self._state_batch = self.connection.table(
             self._table_name).batch(batch_size=write_log_size)
         self._state_stats = defaultdict(int)
+        self._state_last_updates = 0
 
         tables = set(connection.tables())
         if drop_all_tables and self._table_name in tables:
@@ -306,7 +307,7 @@ def update_cache(self, objs):
             self._state_batch.put(unhexlify(fingerprint), prepare_hbase_object(state=state))
             # update LRU cache with the state update
             self._state_cache[fingerprint] = state
-            self._state_stats['_state_updates'] += 1
+            self._state_last_updates += 1
         self._update_batch_stats()
 
     def set_states(self, objs):
@@ -335,8 +336,8 @@ def fetch(self, fingerprints):
         self._state_cache[hexlify(key)] = state
 
     def _update_batch_stats(self):
-        new_batches_count, self._state_stats['_state_updates'] = divmod(
-            self._state_stats['_state_updates'], 
self._state_batch._batch_size) + new_batches_count, self._state_last_updates = divmod( + self._state_last_updates, self._state_batch._batch_size) self._state_stats['states.batches.sent'] += new_batches_count def _update_cache_stats(self, hits, misses): @@ -348,8 +349,7 @@ def _update_cache_stats(self, hits, misses): self._state_stats['states.cache.ratio'] = total_hits / total if total else 0 def get_stats(self): - stats = {stat: value for stat, value in self._state_stats.items() - if not stat.startswith('_')} # do not report entries with _-prefix + stats = self._state_stats.copy() self._state_stats.clear() return stats From 28139a2e8b495c6c40a84741d55797350724839d Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 6 Sep 2017 10:02:43 +0300 Subject: [PATCH 120/273] Provide stats for SW state cache evictions --- frontera/contrib/backends/hbase.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index b559d4e7b..ab7780379 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -65,6 +65,24 @@ def utcnow_timestamp(): return timegm(d.timetuple()) +class LRUCacheWithStats(LRUCache): + """Extended version of standard LRUCache with counting stats.""" + + EVICTED_STATNAME = 'states.cache.evicted' + + def __init__(self, stats=None, *args, **kwargs): + super(LRUCacheWithStats, self).__init__(*args, **kwargs) + self._stats = stats + if self._stats is not None: + self._stats.setdefault(self.EVICTED_STATNAME, 0) + + def popitem(self): + key, val = super(LRUCacheWithStats, self).popitem() + if self._stats: + self._stats[self.EVICTED_STATNAME] += 1 + return key, val + + class HBaseQueue(Queue): GET_RETRIES = 3 @@ -282,10 +300,11 @@ def __init__(self, connection, table_name, cache_size_limit, self.connection = connection self._table_name = to_bytes(table_name) self.logger = logging.getLogger("hbase.states") - self._state_cache = LRUCache(maxsize=cache_size_limit) self._state_batch = self.connection.table( self._table_name).batch(batch_size=write_log_size) self._state_stats = defaultdict(int) + self._state_cache = LRUCacheWithStats(maxsize=cache_size_limit, + stats=self._state_stats) self._state_last_updates = 0 tables = set(connection.tables()) From a4d65bb8a2d92ada81c6269f3e968088c4573d8d Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 1 Sep 2017 17:22:28 +0300 Subject: [PATCH 121/273] Skip links-extracted when reached limit for domain Write stats about dropped LE events Add filter_extracted_links method to API --- frontera/worker/stats.py | 2 +- frontera/worker/strategies/__init__.py | 17 ++++++++++++++++- frontera/worker/strategies/bfs.py | 3 +++ frontera/worker/strategy.py | 13 +++++++++++-- 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/frontera/worker/stats.py b/frontera/worker/stats.py index b2f20d1ea..8c33f9044 100644 --- a/frontera/worker/stats.py +++ b/frontera/worker/stats.py @@ -16,7 +16,7 @@ class StatsExportMixin(object): you to define your custom logic for get_stats_tags() logic in your child classes to store a dictionary with tags as a part of your metrics. 
""" - STATS_PREFIXES = ['consumed', 'pushed'] + STATS_PREFIXES = ['consumed', 'pushed', 'dropped'] def __init__(self, settings, *args, **kwargs): super(StatsExportMixin, self).__init__(settings, *args, **kwargs) diff --git a/frontera/worker/strategies/__init__.py b/frontera/worker/strategies/__init__.py index 1b46f5d96..a1d6006a7 100644 --- a/frontera/worker/strategies/__init__.py +++ b/frontera/worker/strategies/__init__.py @@ -51,10 +51,25 @@ def page_crawled(self, response): :param object response: The :class:`Response ` object for the crawled page. """ + @abstractmethod + def filter_extracted_links(self, request, links): + """ + Called every time on receiving links_extracted event by strategy worker. This call is preceding the call + to links_extracted handler and is aiming to filter unused links and return only those where states + information is needed. + + :param object request: The :class:`Request ` object for the crawled page. + :param list links: A list of :class:`Request ` objects generated from \ + the links extracted for the crawled page. + + :return: A subset of :class:`Request ` input objects. + """ + @abstractmethod def links_extracted(self, request, links): """ - Called every time document was successfully crawled, and receiving page_crawled event from spider log. + Called every time document was successfully crawled, and receiving links_extracted event from spider log, + after the links states are fetched from backend. Should be used to schedule links according to some rules. :param object request: The :class:`Request ` object for the crawled page. :param list links: A list of :class:`Request ` objects generated from \ diff --git a/frontera/worker/strategies/bfs.py b/frontera/worker/strategies/bfs.py index 838498d7f..6e9b20e00 100644 --- a/frontera/worker/strategies/bfs.py +++ b/frontera/worker/strategies/bfs.py @@ -16,6 +16,9 @@ def add_seeds(self, seeds): def page_crawled(self, response): response.meta[b'state'] = States.CRAWLED + def filter_extracted_links(self, request, links): + return links + def links_extracted(self, request, links): for link in links: if link.meta[b'state'] is States.NOT_CRAWLED: diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 5fb9eb6c7..9379a4d6c 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -117,7 +117,8 @@ def __init__(self, settings, strategy_class): 'consumed_add_seeds': 0, 'consumed_page_crawled': 0, 'consumed_links_extracted': 0, - 'consumed_request_error': 0 + 'consumed_request_error': 0, + 'dropped_links_extracted': 0, } self.job_id = 0 self.task = LoopingCall(self.work) @@ -158,7 +159,15 @@ def collect_batch(self): if type == 'links_extracted': _, request, links = msg self.states_context.to_fetch(request) - self.states_context.to_fetch(links) + filtered_links = self.strategy.filter_extracted_links(request, links) + if filtered_links: + # modify last message with a new links list + batch[-1] = (type, request, filtered_links) + self.states_context.to_fetch(filtered_links) + else: + # drop last message if nothing to process + batch.pop() + self.stats['dropped_links_extracted'] += 1 continue if type == 'request_error': _, request, error = msg From c70b0ad3349a4a9ae0a2b382d58d2479a5e7ecaf Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 13 Sep 2017 09:35:49 +0200 Subject: [PATCH 122/273] no need to flush on every batch --- frontera/worker/strategy.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/frontera/worker/strategy.py 
b/frontera/worker/strategy.py index 5fb9eb6c7..0d2a2ce12 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -46,9 +46,7 @@ def send(self, request, score=1.0, dont_queue=False): self.flush() def flush(self): - if self._buffer: - self._producer.send(None, *self._buffer) - self._buffer = [] + pass class StatesContext(object): From c3ce7cc8b89a638c51fb23c4a21cc2bfd3f30551 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 5 Oct 2017 16:48:08 +0200 Subject: [PATCH 123/273] create_request method added to manager, etc --- frontera/contrib/canonicalsolvers/basic.py | 3 +++ frontera/contrib/middlewares/domain.py | 9 +++++--- frontera/contrib/middlewares/fingerprint.py | 3 +++ frontera/core/components.py | 9 ++++++++ frontera/core/manager.py | 25 +++++++++++++++++++-- frontera/worker/strategies/__init__.py | 6 ++--- 6 files changed, 46 insertions(+), 9 deletions(-) diff --git a/frontera/contrib/canonicalsolvers/basic.py b/frontera/contrib/canonicalsolvers/basic.py index 944d8c6c1..1909fdf0a 100644 --- a/frontera/contrib/canonicalsolvers/basic.py +++ b/frontera/contrib/canonicalsolvers/basic.py @@ -31,6 +31,9 @@ def links_extracted(self, request, links): def request_error(self, page, error): self._set_canonical(page) + def create_request(self, request): + self._set_canonical(request) + def _set_canonical(self, obj): if b'redirect_urls' in obj.meta: redirect_urls = obj.meta[b'redirect_urls'] diff --git a/frontera/contrib/middlewares/domain.py b/frontera/contrib/middlewares/domain.py index c8a0f117a..7ea665a3a 100644 --- a/frontera/contrib/middlewares/domain.py +++ b/frontera/contrib/middlewares/domain.py @@ -90,14 +90,17 @@ def links_extracted(self, request, links): def request_error(self, request, error): return self._add_domain(request) + def create_request(self, request): + return self._add_domain(request) + def _add_domain(self, obj): - obj.meta[b'domain'] = self.parse_domain_info(obj.url, self.manager.test_mode) + obj.meta[b'domain'] = self._parse_domain_info(obj.url, self.manager.test_mode) if b'redirect_urls' in obj.meta: - obj.meta[b'redirect_domains'] = [self.parse_domain_info(url, self.manager.test_mode) + obj.meta[b'redirect_domains'] = [self._parse_domain_info(url, self.manager.test_mode) for url in obj.meta[b'redirect_urls']] return obj - def parse_domain_info(self, url, test_mode=False): + def _parse_domain_info(self, url, test_mode=False): if test_mode: match = re.match('([A-Z])\w+', url) netloc = name = to_bytes(match.groups()[0]) if match else b'?' diff --git a/frontera/contrib/middlewares/fingerprint.py b/frontera/contrib/middlewares/fingerprint.py index bb08fca7c..ca79efc3a 100644 --- a/frontera/contrib/middlewares/fingerprint.py +++ b/frontera/contrib/middlewares/fingerprint.py @@ -41,6 +41,9 @@ def links_extracted(self, request, links): def request_error(self, request, error): return self._add_fingerprint(request) + def create_request(self, request): + return self._add_fingerprint(request) + def _add_fingerprint(self, obj): raise NotImplementedError diff --git a/frontera/core/components.py b/frontera/core/components.py index 39e550931..b9746b58e 100644 --- a/frontera/core/components.py +++ b/frontera/core/components.py @@ -187,6 +187,15 @@ class Middleware(Component): """Interface definition for a Frontier Middlewares""" component_name = 'Base Middleware' + def create_request(self, request): + """ + Applying middleware logic on newly created request. 
+
+        :param request: :class:`Request ` object
+        :return: an instance of :class:`Request ` object.
+        """
+        pass
+
 
 @six.add_metaclass(ABCMeta)
 class CanonicalSolver(Middleware):
diff --git a/frontera/core/manager.py b/frontera/core/manager.py
index c83bb780b..1aa488ec1 100644
--- a/frontera/core/manager.py
+++ b/frontera/core/manager.py
@@ -81,9 +81,11 @@ def _load_middlewares(self, middleware_names):
 
         return mws
 
-    def _process_components(self, method_name, obj=None, return_classes=None, **kwargs):
+    def _process_components(self, method_name, obj=None, return_classes=None, components=None, **kwargs):
+        pipeline = self._components_pipeline if components is None else \
+            [self._components_pipeline[c] for c in components]
         return_obj = obj
-        for component_category, component, check_response in self._components_pipeline:
+        for component_category, component, check_response in pipeline:
             components = component if isinstance(component, list) else [component]
             for component in components:
                 result = self._process_component(component=component, method_name=method_name,
@@ -245,6 +247,7 @@ def __init__(self, request_model, response_model, backend, middlewares=None, tes
                                          strategy_worker=strategy_worker)
 
         # Init frontier components pipeline
+        # Some code relies on the order, modify carefully
         self._components_pipeline = [
             ('Middleware', self.middlewares, True),
             ('CanonicalSolver', self.canonicalsolver, False),
@@ -493,6 +496,24 @@ def request_error(self, request, error):
                                                  error=error)
         return processed_page
 
+    def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''):
+        """
+        Creates a request and applies the middleware and canonical solver pipelines.
+
+        :param url: str
+        :param method: bytes
+        :param headers: dict
+        :param cookies: dict
+        :param meta: dict
+        :param body: bytes
+        :return: :class:`Request ` object
+        """
+        r = self.request_model(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body)
+        return self._process_components('create_request',
+                                        obj=r,
+                                        return_classes=self.request_model,
+                                        components=(0, 1))
+
     def _check_startstop(self):
         assert self._started, "Frontier not started!"
         assert not self._stopped, "Call to stopped frontier!"
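The intent of the new API above is that crawling strategies mint requests through the frontier manager, so a request created this way passes the same Middleware and CanonicalSolver stages (pipeline components 0 and 1) as frontier-generated ones and picks up fingerprint and domain metadata on the way. Below is a minimal sketch of a strategy using it; the URL and score are illustrative, and the schedule() helper is assumed to come from the strategy base class, it is not shown in this patch:

    # Sketch only, not part of the patch series.
    from frontera.worker.strategies import BaseCrawlingStrategy

    class ExampleStrategy(BaseCrawlingStrategy):
        def add_seeds(self, seeds):
            for seed in seeds:
                # create_request() delegates to the manager, so middleware fills
                # in meta[b'fingerprint'] and meta[b'domain'] for the new request
                request = self.create_request(seed.url)
                self.schedule(request, score=1.0)  # assumed base-class helper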
diff --git a/frontera/worker/strategies/__init__.py b/frontera/worker/strategies/__init__.py index 1b46f5d96..2c0551f84 100644 --- a/frontera/worker/strategies/__init__.py +++ b/frontera/worker/strategies/__init__.py @@ -22,7 +22,7 @@ class BaseCrawlingStrategy(object): def __init__(self, manager, mb_stream, states_context): self._mb_stream = mb_stream self._states_context = states_context - self.url_mw = UrlFingerprintMiddleware(manager) + self._manager = manager @classmethod def from_worker(cls, manager, mb_stream, states_context): @@ -110,9 +110,7 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No :param body: str :return: :class:`Request ` """ - r = Request(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body) - self.url_mw._add_fingerprint(r) - return r + return self._manager.create_request(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body) def refresh_states(self, requests): """ From 01b3b73baf54ffb945a3cae1d1e03eedbcced5ab Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 5 Oct 2017 16:49:17 +0200 Subject: [PATCH 124/273] removed domain info logic from hbase.queue, domain mw enabled by default --- frontera/contrib/backends/hbase.py | 7 +------ frontera/settings/default_settings.py | 3 ++- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 0d50a9016..3ea2603b6 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -103,12 +103,7 @@ def schedule(self, batch): now = int(time()) for fprint, score, request, schedule in batch: if schedule: - if b'domain' not in request.meta: # TODO: this have to be done always by DomainMiddleware, - # so I propose to require DomainMiddleware by HBaseBackend and remove that code - _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) - if not hostname: - self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint) - request.meta[b'domain'] = {'name': hostname} + assert b'domain' in request.meta timestamp = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else now to_schedule.setdefault(timestamp, []).append((request, score)) for timestamp, batch in six.iteritems(to_schedule): diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index b7d65ac1c..70318f141 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -29,7 +29,8 @@ MESSAGE_BUS = 'frontera.contrib.messagebus.zeromq.MessageBus' MESSAGE_BUS_CODEC = 'frontera.contrib.backends.remote.codecs.msgpack' MIDDLEWARES = [ - 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware', + 'frontera.contrib.middlewares.fingerprint.DomainMiddleware', + 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware' ] NEW_BATCH_DELAY = 30.0 DOMAINS_BLACKLIST = None From 2060f0c8ef6c2751eda0d2fe36c8c40fbc2a49a6 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 5 Oct 2017 19:50:40 +0200 Subject: [PATCH 125/273] enabling domain fingerprint mw by default --- frontera/settings/default_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 70318f141..3a5df4090 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -29,7 +29,7 @@ MESSAGE_BUS = 'frontera.contrib.messagebus.zeromq.MessageBus' MESSAGE_BUS_CODEC = 
'frontera.contrib.backends.remote.codecs.msgpack'
 MIDDLEWARES = [
-    'frontera.contrib.middlewares.fingerprint.DomainMiddleware',
+    'frontera.contrib.middlewares.domain.DomainMiddleware',
     'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware'
 ]
 NEW_BATCH_DELAY = 30.0
From bea71e8374b85d0a6ae1608da2303d7f4d150a7f Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Tue, 8 May 2018 15:20:31 +0200
Subject: [PATCH 126/273] strategy tests fixed

---
 frontera/contrib/backends/memory/__init__.py | 57 +++++++++++++++++++-
 tests/test_strategy.py                       | 36 ++++++-------
 2 files changed, 73 insertions(+), 20 deletions(-)

diff --git a/frontera/contrib/backends/memory/__init__.py b/frontera/contrib/backends/memory/__init__.py
index 2e6deb7ff..456807fa9 100644
--- a/frontera/contrib/backends/memory/__init__.py
+++ b/frontera/contrib/backends/memory/__init__.py
@@ -4,7 +4,7 @@
 from collections import deque, Iterable
 
 from frontera.contrib.backends import CommonBackend
-from frontera.core.components import Metadata, Queue, States
+from frontera.core.components import Metadata, Queue, States, DistributedBackend
 from frontera.core import OverusedBuffer
 from frontera.utils.heap import Heap
 from frontera.contrib.backends.partitioners import Crc32NamePartitioner
@@ -255,6 +255,61 @@ def get_next_requests(self, max_next_requests, **kwargs):
         return self.overused_buffer.get_next_requests(max_next_requests, **kwargs)
 
 
+class MemoryDistributedBackend(DistributedBackend):
+    def __init__(self, manager):
+        settings = manager.settings
+        self._states = MemoryStates(1000)
+        self._queue = MemoryQueue(settings.get('SPIDER_FEED_PARTITIONS'))
+        self._metadata = MemoryMetadata()  # backs the metadata property below
+        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
+        self.logger = logging.getLogger("memory.distributed")
+
+    def add_seeds(self, seeds):
+        pass
+
+    def page_crawled(self, response):
+        pass
+
+    def request_error(self, page, error):
+        pass
+
+    def finished(self):
+        pass
+
+    def links_extracted(self, request, links):
+        pass
+
+    @property
+    def metadata(self):
+        return self._metadata
+
+    @property
+    def queue(self):
+        return self._queue
+
+    @property
+    def states(self):
+        return self._states
+
+    def get_next_requests(self, max_n_requests, **kwargs):
+        next_pages = []
+        partitions = set(kwargs.pop('partitions', []))
+        for partition_id in range(0, self.queue_partitions):
+            if partition_id not in partitions:
+                continue
+            results = self.queue.get_next_requests(max_n_requests, partition_id)
+            next_pages.extend(results)
+            self.logger.debug("Got %d requests for partition id %d", len(results), partition_id)
+        return next_pages
+
+    @classmethod
+    def strategy_worker(cls, manager):
+        return cls(manager)
+
+    @classmethod
+    def db_worker(cls, manager):
+        return cls(manager)
+
+
 BASE = MemoryBaseBackend
 FIFO = MemoryFIFOBackend
 LIFO = MemoryLIFOBackend
diff --git a/tests/test_strategy.py b/tests/test_strategy.py
index 3c6e5dafc..1b39c5756 100644
--- a/tests/test_strategy.py
+++ b/tests/test_strategy.py
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
-from frontera.worker.strategies import BaseCrawlingStrategy
-from frontera.worker.strategy import StatesContext
-from frontera.settings import Settings
-from tests.mocks.frontier_manager import FakeFrontierManager
+from unittest import TestCase
 
-from frontera.contrib.backends.memory import MemoryStates
+from frontera import FrontierManager
 from frontera.core.components import States
+from frontera.settings import Settings
+from frontera.worker.strategies import BaseCrawlingStrategy
+from frontera.worker.strategy import StatesContext
 
 
 class DummyCrawlingStrategy(BaseCrawlingStrategy):
@@
-30,29 +30,27 @@ def flush(self): pass -class TestCrawlingStrategy(object): - def strategy(self): +class TestCrawlingStrategy(TestCase): + def setUp(self): settings = Settings() - manager = FakeFrontierManager(settings) - stream = MessageBusStream() - states = MemoryStates(10) - states_ctx = StatesContext(states) - return DummyCrawlingStrategy.from_worker(manager, stream, states_ctx) + settings.BACKEND = "frontera.contrib.backends.memory.MemoryDistributedBackend" + self.manager = FrontierManager.from_settings(settings, db_worker=False, strategy_worker=True) + self.stream = MessageBusStream() + self.states_ctx = StatesContext(self.manager.backend.states) + self.strategy = DummyCrawlingStrategy(self.manager, self.stream, self.states_ctx) def test_create_request(self): - s = self.strategy() - req = s.create_request("http://test.com/someurl") + req = self.strategy.create_request("http://test.com/someurl") assert req.meta[b'fingerprint'] == b'955ac04f1b1a96de60a5139ad90c80be87822159' def test_states_refresh(self): - s = self.strategy() - states = s._states_context._states + states = self.manager.backend.states url = "http://test.com/someurl" - req1 = s.create_request(url) + req1 = self.strategy.create_request(url) req1.meta[b'state'] = States.CRAWLED states.update_cache(req1) - req2 = s.create_request(url) - s.refresh_states([req2]) + req2 = self.strategy.create_request(url) + self.strategy.refresh_states([req2]) assert req2.meta[b'state'] == req1.meta[b'state'] assert req2.meta[b'state'] == States.CRAWLED From c6e074fbe4b9a16b9e433f3db752b6569422c233 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 8 May 2018 15:44:35 +0200 Subject: [PATCH 127/273] using unittest --- tests/test_worker_strategy.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_worker_strategy.py b/tests/test_worker_strategy.py index 7a2acd873..d7adbf95a 100644 --- a/tests/test_worker_strategy.py +++ b/tests/test_worker_strategy.py @@ -3,7 +3,7 @@ from frontera.settings import Settings from frontera.core.models import Request, Response from frontera.core.components import States - +from unittest import TestCase r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'jid': 0}) r2 = Request('http://www.scrapy.org/', meta={b'fingerprint': b'2', b'jid': 0}) @@ -11,17 +11,17 @@ r4 = Request('http://www.test.com/some/page', meta={b'fingerprint': b'4', b'jid': 0}) -class TestStrategyWorker(object): +class TestStrategyWorker(TestCase): - def sw_setup(self): + def setUp(self): settings = Settings() settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 - return StrategyWorker(settings, CrawlingStrategy) + self.sw = StrategyWorker(settings, CrawlingStrategy) def test_add_seeds(self): - sw = self.sw_setup() + sw = self.sw msg = sw._encoder.encode_add_seeds([r1, r2, r3, r4]) sw.consumer.put_messages([msg]) r2.meta[b'state'] = States.CRAWLED @@ -36,7 +36,7 @@ def test_add_seeds(self): for r in [r1, r3, r4]]) def test_page_crawled(self): - sw = self.sw_setup() + sw = self.sw r1.meta[b'jid'] = 1 resp = Response(r1.url, request=r1) msg = sw._encoder.encode_page_crawled(resp) @@ -52,7 +52,7 @@ def test_page_crawled(self): assert r1c.meta[b'state'] == States.CRAWLED def test_links_extracted(self): - sw = self.sw_setup() + sw = self.sw sw.job_id = 0 r1.meta[b'jid'] = 0 msg = sw._encoder.encode_links_extracted(r1, [r3, r4]) @@ -64,7 +64,7 @@ def 
test_links_extracted(self): set(sw._encoder.encode_update_score(r, sw.strategy.get_score(r.url), True) for r in [r3, r4]) def test_request_error(self): - sw = self.sw_setup() + sw = self.sw msg = sw._encoder.encode_request_error(r4, 'error') sw.consumer.put_messages([msg]) sw.work() From 0bd0255c1f468ce633290c9c5870f804e95b8578 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 8 May 2018 15:49:33 +0200 Subject: [PATCH 128/273] tests fix --- frontera/worker/strategy.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 0d2a2ce12..26a3435d9 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -29,21 +29,18 @@ class UpdateScoreStream(object): - def __init__(self, encoder, scoring_log_producer, size): + + def __init__(self, producer, encoder): + self._producer = producer self._encoder = encoder - self._buffer = [] - self._producer = scoring_log_producer - self._size = size def send(self, request, score=1.0, dont_queue=False): encoded = self._encoder.encode_update_score( - request, - score, - not dont_queue + request=request, + score=score, + schedule=not dont_queue ) - self._buffer.append(encoded) - if len(self._buffer) > self._size: - self.flush() + self._producer.send(None, encoded) def flush(self): pass @@ -104,7 +101,7 @@ def __init__(self, settings, strategy_class): self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model) self._encoder = encoder_cls(self._manager.request_model) - self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024) + self.update_score = UpdateScoreStream(self.scoring_log_producer, self._encoder) self.states_context = StatesContext(self._manager.backend.states) self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') From 389396a770fb2e372e010a8bd550aae19c70de7b Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 8 May 2018 15:51:09 +0200 Subject: [PATCH 129/273] speeding up serialization --- .../contrib/backends/remote/codecs/msgpack.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py index 8fe3921a5..fa700ce92 100644 --- a/frontera/contrib/backends/remote/codecs/msgpack.py +++ b/frontera/contrib/backends/remote/codecs/msgpack.py @@ -13,26 +13,27 @@ logger = logging.getLogger(__name__) +def _serialize(obj): + """Recursively walk object's hierarchy.""" + if isinstance(obj, (bool, six.integer_types, float, six.binary_type, six.text_type)) or obj is None: + return obj + elif isinstance(obj, dict): + obj = obj.copy() + for key in obj: + obj[key] = _serialize(obj[key]) + return obj + elif isinstance(obj, list): + return [_serialize(item) for item in obj] + elif isinstance(obj, tuple): + return tuple(_serialize([item for item in obj])) + elif hasattr(obj, '__dict__'): + return _serialize(obj.__dict__) + else: + logger.warning('unable to serialize object: {}'.format(obj)) + return None + def _prepare_request_message(request): - def serialize(obj): - """Recursively walk object's hierarchy.""" - if isinstance(obj, (bool, six.integer_types, float, six.binary_type, six.text_type)) or obj is None: - return obj - elif isinstance(obj, dict): - obj = obj.copy() - for key in obj: - obj[key] = serialize(obj[key]) - return obj - elif isinstance(obj, list): - return [serialize(item) for item in obj] - elif isinstance(obj, 
tuple): - return tuple(serialize([item for item in obj])) - elif hasattr(obj, '__dict__'): - return serialize(obj.__dict__) - else: - logger.warning('unable to serialize object: {}'.format(obj)) - return None - return [request.url, request.method, request.headers, request.cookies, serialize(request.meta)] + return [request.url, request.method, request.headers, request.cookies, _serialize(request.meta)] def _prepare_response_message(response, send_body): From 6773404bf88353edbe92e94e0148486d89510384 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 8 May 2018 15:59:42 +0200 Subject: [PATCH 130/273] test case --- tests/test_worker_strategy.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_worker_strategy.py b/tests/test_worker_strategy.py index 7a2acd873..35179f91e 100644 --- a/tests/test_worker_strategy.py +++ b/tests/test_worker_strategy.py @@ -11,6 +11,11 @@ r4 = Request('http://www.test.com/some/page', meta={b'fingerprint': b'4', b'jid': 0}) +class FilteredLinksCrawlingStrategy(CrawlingStrategy): + def filter_extracted_links(self, request, links): + return [] + + class TestStrategyWorker(object): def sw_setup(self): @@ -20,6 +25,13 @@ def sw_setup(self): settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 return StrategyWorker(settings, CrawlingStrategy) + def sw_setup_filtered_links(self): + settings = Settings() + settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' + settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' + settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 + return StrategyWorker(settings, FilteredLinksCrawlingStrategy) + def test_add_seeds(self): sw = self.sw_setup() msg = sw._encoder.encode_add_seeds([r1, r2, r3, r4]) @@ -63,6 +75,17 @@ def test_links_extracted(self): assert set(sw.scoring_log_producer.messages) == \ set(sw._encoder.encode_update_score(r, sw.strategy.get_score(r.url), True) for r in [r3, r4]) + def test_filter_links_extracted(self): + sw = self.sw_setup_filtered_links() + sw.job_id = 0 + r1.meta[b'jid'] = 0 + msg = sw._encoder.encode_links_extracted(r1, [r3, r4]) + sw.consumer.put_messages([msg]) + sw.work() + r3.meta[b'state'] = States.QUEUED + r4.meta[b'state'] = States.QUEUED + assert set(sw.scoring_log_producer.messages) == set() + def test_request_error(self): sw = self.sw_setup() msg = sw._encoder.encode_request_error(r4, 'error') From 9d6ad059fa0877fca4fc432b540ef7f6b730e917 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 24 Nov 2017 15:49:14 +0100 Subject: [PATCH 131/273] removed ambiguous redirect logic --- frontera/contrib/scrapy/schedulers/frontier.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/frontera/contrib/scrapy/schedulers/frontier.py b/frontera/contrib/scrapy/schedulers/frontier.py index 75592c192..a9ca35d52 100644 --- a/frontera/contrib/scrapy/schedulers/frontier.py +++ b/frontera/contrib/scrapy/schedulers/frontier.py @@ -89,14 +89,11 @@ def from_crawler(cls, crawler): return cls(crawler) def enqueue_request(self, request): - if not self._request_is_redirected(request): - self.frontier.add_seeds([request]) - self.stats_manager.add_seeds() - return True - elif self.redirect_enabled: + if self.redirect_enabled: self._add_pending_request(request) self.stats_manager.add_redirected_requests() return True + self.logger.warning("The enqueue_request failed on %s", request.url) return False def next_request(self): @@ -165,9 +162,6 @@ def _get_exception_code(self, exception): except Exception: return '?' 
- def _request_is_redirected(self, request): - return request.meta.get(b'redirect_times', 0) > 0 - def _get_downloader_info(self): downloader = self.crawler.engine.downloader info = { From a821652c3a2047f858fdc60cafb6c5702a7096f1 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 25 Dec 2017 12:42:33 +0100 Subject: [PATCH 132/273] retries budget added in offsets fetch --- frontera/contrib/messagebus/kafka/async.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/frontera/contrib/messagebus/kafka/async.py b/frontera/contrib/messagebus/kafka/async.py index 8e3036bf9..158e3a0e9 100644 --- a/frontera/contrib/messagebus/kafka/async.py +++ b/frontera/contrib/messagebus/kafka/async.py @@ -162,9 +162,9 @@ def offsets(self, partitions, timestamp): Returns: dict: TopicPartition and message offsets """ - while True: + retries = 3 + while retries > 0: offsets = {} - ok = True for future in self._send_offset_request(partitions, timestamp): self._client.poll(future=future) @@ -179,11 +179,13 @@ def offsets(self, partitions, timestamp): if future.exception.invalid_metadata: refresh_future = self._client.cluster.request_update() self._client.poll(future=refresh_future, sleep=True) - ok = False - log.warning("Got exception %s and kept the loop.", future.exception) - break - if ok: + log.warning("Got exception %s and kept the loop", future.exception) + if offsets: return offsets + retries -= 1 + log.warning("Retrying the offsets fetch loop (%d retries left)", retries) + log.error("Unsuccessful offsets retrieval") + return {} def _send_offset_request(self, partitions, timestamp): """Fetch a single offset before the given timestamp for the partition. From a6057e9fd9b7567b9bfe97f162846ae15c079db9 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 30 Jan 2018 08:50:29 +0100 Subject: [PATCH 133/273] fix of partitioning in batch-gen --- frontera/worker/components/batch_generator.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 82a2bb26f..06d6f18cc 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -106,10 +106,4 @@ def get_fingerprint(self, request): return request.meta[b'fingerprint'] def get_hostname(self, request): - try: - _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) - except Exception as e: - self.logger.error("URL parsing error %s, fingerprint %s, url %s" % - (e, request.meta[b'fingerprint'], request.url)) - else: - return hostname.encode('utf-8', 'ignore') + return request.meta[b'domain'][b'name'] From 8d6e7b17bb482365dd9fe34ccef4db9eb518662e Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 7 Feb 2018 12:37:58 +0100 Subject: [PATCH 134/273] domain stats in batch gen --- docs/source/topics/frontera-settings.rst | 11 ++++++ frontera/settings/default_settings.py | 1 + frontera/worker/components/batch_generator.py | 35 +++++++++++++++---- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index d6afe815b..8131bb885 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -173,6 +173,17 @@ Delay between calls to backend for new batches in Scrapy scheduler, when queue s without hitting backend on every request. 
Increase it if calls to your backend is taking too long, and decrease if you need a fast spider bootstrap from seeds. + +.. setting:: DOMAIN_STATS_LOG_INTERVAL + +DOMAIN_STATS_LOG_INTERVAL +------------------------- + +Default: ``300`` + +Time interval in seconds to rotate the domain statistics in :term:`db worker` batch generator. Enabled only when +logging set to DEBUG. + .. setting:: KAFKA_GET_TIMEOUT KAFKA_GET_TIMEOUT diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 3a5df4090..a37a44204 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -10,6 +10,7 @@ CANONICAL_SOLVER = 'frontera.contrib.canonicalsolvers.Basic' DELAY_ON_EMPTY = 5.0 DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' +DOMAIN_STATS_LOG_INTERVAL = 300 HBASE_THRIFT_HOST = 'localhost' HBASE_THRIFT_PORT = 9090 diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 06d6f18cc..43f09e122 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -2,9 +2,9 @@ from __future__ import absolute_import import threading -from time import asctime - -from six.moves import map +from time import asctime, time +from collections import defaultdict +from logging import DEBUG from frontera.exceptions import NotConfigured from frontera.utils.url import parse_domain_from_url_fast @@ -37,6 +37,12 @@ def __init__(self, worker, settings, stop_event, # create an event to disable/enable batches generation via RPC self.disabled_event = threading.Event() + # domain statistics logging + self.domain_stats = dict([(partition_id, defaultdict(int)) for partition_id in self.partitions]) + self.domain_stats_interval = settings.get('DOMAIN_STATS_LOG_INTERVAL') + self.rotate_time = time() + + def get_ready_partitions(self): pending_partitions = self.spider_feed.available_partitions() if not self.partitions: @@ -46,6 +52,9 @@ def get_ready_partitions(self): def run(self): if self.disabled_event.is_set(): return True + if self.logger.isEnabledFor(DEBUG) and time() > self.rotate_time: + self.rotate_and_log_domain_stats() + partitions = self.get_ready_partitions() if not partitions: return True @@ -75,11 +84,14 @@ def _handle_partition(self, partition_id): continue try: self.spider_feed_producer.send(self.get_key_function(request), eo) - except Exception as exc: - self.logger.error("Sending message error, %s, fingerprint: %s, url: %s" % - (exc, self.get_fingerprint(request), request.url)) + except Exception: + self.logger.exception("Sending message error fingerprint: %s, url: %s" % + (self.get_fingerprint(request), request.url)) finally: count += 1 + hostname = self.get_hostname(request) + if self.logger.isEnabledFor(DEBUG): + self.domain_stats[partition_id][hostname] += 1 self.update_stats(increments={'pushed_since_start': count}) return count @@ -100,6 +112,17 @@ def _is_domain_blacklisted(self, request): def close(self): self.spider_feed_producer.close() + def rotate_and_log_domain_stats(self): + self.logger.debug("Domain statistics of requests pushed to spider feed") + for partition_id, host_stats in self.domain_stats: + self.logger.debug("PID %d =================================================================", partition_id) + for hostname, count in host_stats.items(): + self.logger.debug("%s\t%d", hostname, count) + + self.domain_stats[partition_id] = defaultdict(int) + self.rotate_time = time() + self.domain_stats_interval + + # 
--------------------------- Auxiliary tools -------------------------------- def get_fingerprint(self, request): From 88cdd462b31c853520977937dc96e9a33804550d Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 7 Feb 2018 14:05:01 +0100 Subject: [PATCH 135/273] fixes --- frontera/worker/components/batch_generator.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 43f09e122..297f7cbc6 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -40,8 +40,7 @@ def __init__(self, worker, settings, stop_event, # domain statistics logging self.domain_stats = dict([(partition_id, defaultdict(int)) for partition_id in self.partitions]) self.domain_stats_interval = settings.get('DOMAIN_STATS_LOG_INTERVAL') - self.rotate_time = time() - + self.rotate_time = time() + self.domain_stats_interval def get_ready_partitions(self): pending_partitions = self.spider_feed.available_partitions() @@ -114,7 +113,7 @@ def close(self): def rotate_and_log_domain_stats(self): self.logger.debug("Domain statistics of requests pushed to spider feed") - for partition_id, host_stats in self.domain_stats: + for partition_id, host_stats in sorted(self.domain_stats.items(), key=lambda x: x[1]): self.logger.debug("PID %d =================================================================", partition_id) for hostname, count in host_stats.items(): self.logger.debug("%s\t%d", hostname, count) From a868aeb5c639701da84ab67566933273f1fa8b2a Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 12 Feb 2018 19:39:33 +0100 Subject: [PATCH 136/273] hbase refactor, first DomainCache with second gen draft --- .../backends/{hbase.py => hbase/__init__.py} | 1 - .../contrib/backends/hbase/domaincache.py | 289 ++++++++++++++++++ frontera/contrib/backends/hbase/utils.py | 22 ++ .../contrib/backends/remote/codecs/msgpack.py | 35 +-- frontera/utils/msgpack.py | 22 ++ tests/test_domain_cache.py | 106 +++++++ 6 files changed, 446 insertions(+), 29 deletions(-) rename frontera/contrib/backends/{hbase.py => hbase/__init__.py} (99%) create mode 100644 frontera/contrib/backends/hbase/domaincache.py create mode 100644 frontera/contrib/backends/hbase/utils.py create mode 100644 frontera/utils/msgpack.py create mode 100644 tests/test_domain_cache.py diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase/__init__.py similarity index 99% rename from frontera/contrib/backends/hbase.py rename to frontera/contrib/backends/hbase/__init__.py index d719457e9..fbc030c20 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase/__init__.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, division -from frontera.utils.url import parse_domain_from_url_fast from frontera import DistributedBackend from frontera.core.components import Metadata, Queue, States from frontera.core.models import Request diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py new file mode 100644 index 000000000..73639fb37 --- /dev/null +++ b/frontera/contrib/backends/hbase/domaincache.py @@ -0,0 +1,289 @@ +from __future__ import absolute_import + +import logging +from collections import defaultdict +from time import time + +import six +from frontera.contrib.backends.hbase.utils import HardenedBatch +from frontera.utils.msgpack import restruct_for_pack +from 
happybase.batch import DEFAULT_HBASE_THRIFT_FRAME_SIZE +from msgpack import packb, unpackb +from w3lib.util import to_bytes + +DOMAIN_CACHE_BATCH_SIZE = 100 + +import collections +from cachetools import Cache + + +class LRUCache(Cache): + """Least Recently Used (LRU) cache implementation.""" + + def __init__(self, maxsize, missing=None, getsizeof=None): + Cache.__init__(self, maxsize, missing, getsizeof) + self.__order = collections.OrderedDict() + + def __getitem__(self, key, cache_getitem=Cache.__getitem__): + value = cache_getitem(self, key) + self._update_order(key) + return value + + def __setitem__(self, key, value, cache_setitem=Cache.__setitem__): + cache_setitem(self, key, value) + self._update_order(key) + + def __delitem__(self, key, cache_delitem=Cache.__delitem__): + cache_delitem(self, key) + del self.__order[key] + + def popitem(self): + """Remove and return the `(key, value)` pair least recently used.""" + try: + key = next(iter(self.__order)) + except StopIteration: + raise KeyError('%s is empty' % self.__class__.__name__) + else: + return (key, self.pop(key)) + + if hasattr(collections.OrderedDict, 'move_to_end'): + def _update_order(self, key): + try: + self.__order.move_to_end(key) + except KeyError: + self.__order[key] = None + else: + def _update_order(self, key): + try: + self.__order[key] = self.__order.pop(key) + except KeyError: + self.__order[key] = None + + +class DomainCache(LRUCache): + + MAX_VALUE_SIZE = int(DEFAULT_HBASE_THRIFT_FRAME_SIZE * 0.95) + LOG_INTERVAL = 60.0 + + def __init__(self, maxsize, connection, table_name, set_fields=None, on_get_func=None): + super(DomainCache, self).__init__(maxsize) + + self._second_gen = dict() + + table_name = to_bytes(table_name) + self._table = self._get_domain_table(connection, table_name) + self._batch = HardenedBatch(self._table, batch_size=DOMAIN_CACHE_BATCH_SIZE) + self._set_fields = set(set_fields) if set_fields else set() + self._on_get_func = on_get_func + + self.logger = logging.getLogger("domain-cache") + self.stats = defaultdict(int) + self.next_log = time() + self.LOG_INTERVAL + + # Primary methods + + def __setitem__(self, key, value): + self._key_check(key) + assert isinstance(value, dict) + super(DomainCache, self).__setitem__(key, value) + + def __getitem__(self, key): + self._key_check(key) + try: + value = Cache.__getitem__(self, key) + except KeyError as ke1: + try: + value = self._second_gen[key] + except KeyError as ke2: + try: + value = self._get_item(key) + except KeyError as ke3: + raise ke3 + else: + self.__setitem__(key, value) + else: + self.__setitem__(key, value) + del self._second_gen[key] + else: + self._update_order(key) + return value + + def __delitem__(self, key): + self._key_check(key) + not_found = True + if super(DomainCache, self).__contains__(key): + super(DomainCache, self).__delitem__(key) + not_found = False + if key in self._second_gen: + del self._second_gen[key] + not_found = False + rk = to_bytes(key) + if self._table.row(rk): + self._table.delete(rk) + not_found = False + if not_found: + raise KeyError + + def __contains__(self, key): + self._key_check(key) + self.stats["contains"] += 1 + if super(DomainCache, self).__contains__(key): + self.stats["contains_in_memory"] += 1 + return True + if key in self._second_gen: + self.stats["contains_in_secgen"] += 1 + return True + if self._table.row(to_bytes(key)): + self.stats["contains_in_hbase"] += 1 + return True + self.stats["contains_false"] += 1 + return False + + def popitem(self): + """ + Called every time item is 
evicted by LRU cache + """ + key, value = super(DomainCache, self).popitem() + self._second_gen[key] = value + self.stats["pops"] += 1 + if len(self._second_gen) >= DOMAIN_CACHE_BATCH_SIZE: + self._flush_second_gen() + self._second_gen.clear() + self.stats["flushes"]+=1 + + # These methods aren't meant to be implemented + + def __missing__(self, key): + raise KeyError + + __len__ = None + + __iter__ = None + + clear = None + + maxsize = None + + # Secondary methods, those that are depend on primary + + def get(self, key, default=None): + """ + HBase-optimized get + """ + self._key_check(key) + self._log_and_rotate_stats() + if super(DomainCache, self).__contains__(key) or key in self._second_gen: + self.stats["gets_memory_hit"] += 1 + return self[key] + try: + value = self._get_item(key) + except KeyError: + self.stats["gets_miss"] += 1 + return default + else: + self.stats["gets_hbase_hit"] += 1 + return value + + def setdefault(self, key, default=None): + """ + HBase-optimized setdefault + """ + self._key_check(key) + self.stats["gets"]+=1 + self._log_and_rotate_stats() + if super(DomainCache, self).__contains__(key) or key in self._second_gen: + value = self[key] + self.stats["gets_memory_hit"] += 1 + else: + try: + value = self._get_item(key) + except KeyError: + self.stats["gets_miss"] += 1 + value = default + else: + self.stats["gets_hbase_hit"] += 1 + self[key] = value + return value + + def flush(self): + for k, v in super(DomainCache, self).__iter__(): + try: + self._store_item_batch(k, v) + except Exception: + self.logger.exception("Error storing kv pair %s, %s", k, v) + pass + self._flush_second_gen() + self._batch.send() + + # private + + def _flush_second_gen(self): + for key, value in six.iteritems(self._second_gen): + self._store_item_batch(key, value) + self._batch.send() + + def _log_and_rotate_stats(self): + if not self.logger.isEnabledFor(logging.DEBUG): + return + if time() > self.next_log: + for k, v in self.stats.items(): + self.logger.debug("%s = %d", k, v) + self.next_log = time() + self.LOG_INTERVAL + self.stats = defaultdict(int) + + def _get_domain_table(self, connection, table_name): + tables = set(connection.tables()) + if table_name not in tables: + schema = {'m': {'max_versions': 1}} + connection.create_table(table_name, schema) + return connection.table(table_name) + + def _get_item(self, key): + self.stats["hbase_gets"]+=1 + hbase_key = to_bytes(key) + row = self._table.row(hbase_key) + if not row: + self.stats["hbase_misses"] += 1 + super(DomainCache, self).__missing__(key) + raise KeyError + value = {} + for k, v in six.iteritems(row): + cf, _, col = k.partition(':') + value[col] = unpackb(v, encoding='utf-8') + # XXX extract some fields as a set for faster in-checks + if col in self._set_fields: + value[col] = set(value[col]) + if self._on_get_func: + self._on_get_func(value) + return value + + def _store_item_batch(self, key, value): + data = {} + self._key_check(key) + for k, v in six.iteritems(value): + if k.startswith('_'): + continue + # convert set to list manually for successful serialization + v = restruct_for_pack(v) + k = to_bytes(k) + data[b"m:%s" % k] = packb(v, use_bin_type=True) + tries = 3 + while data and tries > 0: + try: + self._batch.put(key, data) + except ValueError: + self.logger.exception("Exception happened during item storing, %d tries left", tries) + data_lengths = dict((k, len(v)) for k, v in six.iteritems(data)) + self.logger.info("RK %s per-column lengths %s", key, str(data_lengths)) + for k ,length in data_lengths.items(): + 
if length > self.MAX_VALUE_SIZE:
+                        self.logger.info("Dropping key %s", k)
+                        del data[k]
+                tries -= 1
+                continue
+            else:
+                break
+
+    def _key_check(self, key):
+        if len(key) == 0 or len(key) > 255:
+            raise KeyError("Key cannot be empty or longer than 255 chars")
\ No newline at end of file
diff --git a/frontera/contrib/backends/hbase/utils.py b/frontera/contrib/backends/hbase/utils.py
new file mode 100644
index 000000000..0f58a55cb
--- /dev/null
+++ b/frontera/contrib/backends/hbase/utils.py
@@ -0,0 +1,22 @@
+from __future__ import absolute_import
+from happybase import Batch
+
+from thriftpy.transport import TTransportException
+import logging
+
+
+class HardenedBatch(Batch):
+    def __init__(self, table, timestamp=None, batch_size=None,
+                 transaction=False, wal=True):
+        super(HardenedBatch, self).__init__(table, timestamp=timestamp, batch_size=batch_size, transaction=transaction,
+                                            wal=wal)
+        self.logger = logging.getLogger("happybase.batch")
+
+    def send(self):
+        try:
+            super(HardenedBatch, self).send()
+        except TTransportException:
+            self.logger.exception("Exception happened during batch persistence")
+            self.logger.warning("Cleaning up the batch")
+            self._reset_mutations()
\ No newline at end of file
diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py
index fa700ce92..a5f3129d6 100644
--- a/frontera/contrib/backends/remote/codecs/msgpack.py
+++ b/frontera/contrib/backends/remote/codecs/msgpack.py
@@ -2,38 +2,15 @@
 """ A MsgPack codec for Frontera. Implemented using native msgpack-python library. """
 from __future__ import absolute_import
-import logging
 
-from msgpack import packb, unpackb
 from frontera.core.codec import BaseDecoder, BaseEncoder
-import six
+from frontera.utils.msgpack import restruct_for_pack
+from msgpack import packb, unpackb
 from w3lib.util import to_native_str
 
-logger = logging.getLogger(__name__)
-
-
-def _serialize(obj):
-    """Recursively walk object's hierarchy."""
-    if isinstance(obj, (bool, six.integer_types, float, six.binary_type, six.text_type)) or obj is None:
-        return obj
-    elif isinstance(obj, dict):
-        obj = obj.copy()
-        for key in obj:
-            obj[key] = _serialize(obj[key])
-        return obj
-    elif isinstance(obj, list):
-        return [_serialize(item) for item in obj]
-    elif isinstance(obj, tuple):
-        return tuple(_serialize([item for item in obj]))
-    elif hasattr(obj, '__dict__'):
-        return _serialize(obj.__dict__)
-    else:
-        logger.warning('unable to serialize object: {}'.format(obj))
-        return None
 
 def _prepare_request_message(request):
-    return [request.url, request.method, request.headers, request.cookies, _serialize(request.meta)]
+    return [request.url, request.method, request.headers, request.cookies, restruct_for_pack(request.meta)]
 
 
 def _prepare_response_message(response, send_body):
@@ -115,7 +92,9 @@ def decode(self, buffer):
             return ('offset', int(obj[1]), int(obj[2]))
         if obj[0] == b'st':
             return ('stats', obj[1])
         raise TypeError('Unknown message type')
 
     def decode_request(self, buffer):
-        return self._request_from_object(unpackb(buffer, encoding='utf-8'))
+        return self._request_from_object(unpackb(buffer))
+
+
diff --git a/frontera/utils/msgpack.py b/frontera/utils/msgpack.py
new file mode 100644
index 000000000..8d8549a87
--- /dev/null
+++ b/frontera/utils/msgpack.py
@@ -0,0 +1,22 @@
+import six
+
+
+def restruct_for_pack(obj):
+    """Recursively walk object's hierarchy."""
+    if isinstance(obj, six.text_type):
+        return obj.encode('utf8')
+    if 
isinstance(obj, (bool, six.integer_types, float, six.binary_type)): + return obj + elif isinstance(obj, dict): + obj = obj.copy() + for key in obj: + obj[key] = restruct_for_pack(obj[key]) + return obj + elif isinstance(obj, list): + return [restruct_for_pack(item) for item in obj] + elif isinstance(obj, tuple): + return tuple(restruct_for_pack([item for item in obj])) + elif hasattr(obj, '__dict__'): + return restruct_for_pack(obj.__dict__) + else: + return None \ No newline at end of file diff --git a/tests/test_domain_cache.py b/tests/test_domain_cache.py new file mode 100644 index 000000000..e4f60c693 --- /dev/null +++ b/tests/test_domain_cache.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +from frontera.contrib.backends.hbase.domaincache import DomainCache +from happybase import Connection +import logging +import unittest + + +class TestDomainCache(unittest.TestCase): + def setUp(self): + logging.basicConfig(level=logging.DEBUG) + self.conn = Connection(host="hbase-docker", table_prefix="contacts", table_prefix_separator=":") + t = self.conn.table('domain_metadata') + t.delete('d1') + t.delete('d2') + t.delete('d3') + t.delete('d4') + + def test_domain_cache_both_generations(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + + # eviction should happen + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + assert dc['d1'] == {'domain': 1} + assert dc['d2'] == {'domain': 2} + assert dc['d3'] == {'domain': [3, 2, 1]} + assert dc['d4'] == {'domain': 4} + + def test_domain_cache_get_with_default(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + assert dc.get('d1', {}) == {'domain': 1} + assert dc.get('d3', {}) == {'domain': [3, 2, 1]} + + def test_domain_cache_setdefault(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + assert dc.setdefault('d1', {}) == {'domain': 1} + assert dc.setdefault('d5', {'domain': 6}) == {'domain': 6} + dc.flush() + assert dc.setdefault('d3', {}) == {'domain': [3, 2, 1]} + + def test_empty_key(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + with self.assertRaises(KeyError): + dc[''] = {'test':1} + + def test_deletion(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + with self.assertRaises(KeyError): + del dc['d1'] + + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + del dc['d1'] # second gen + del dc['d3'] # first gen + + dc.flush() + + del dc['d4'] # hbase + + def test_contains(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + assert 'd1' in dc # second gen + assert 'd3' in dc # first gen + + dc.flush() + + assert 'd4' in dc + + def test_pop(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + assert dc.pop('d1') == {'domain': 1} + assert 'd1' not in dc + + assert dc.pop('d3') == {'domain': [3, 2, 1]} + assert 'd3' not in dc + + dc.flush() + + assert dc.pop('d4') == {'domain': 4} + assert 'd4' not in dc \ No newline at end of file From 6590a1c65bfbc8760a7387913ee4ee56633cea50 Mon Sep 17 00:00:00 
2001
From: Alexander Sibiryakov
Date: Tue, 13 Feb 2018 11:04:00 +0100
Subject: [PATCH 137/273] converting keys to native str

---
 frontera/contrib/backends/hbase/domaincache.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py
index 73639fb37..0bde9e09d 100644
--- a/frontera/contrib/backends/hbase/domaincache.py
+++ b/frontera/contrib/backends/hbase/domaincache.py
@@ -9,7 +9,7 @@
 from frontera.utils.msgpack import restruct_for_pack
 from happybase.batch import DEFAULT_HBASE_THRIFT_FRAME_SIZE
 from msgpack import packb, unpackb
-from w3lib.util import to_bytes
+from w3lib.util import to_bytes, to_native_str
 
 DOMAIN_CACHE_BATCH_SIZE = 100
 
@@ -249,6 +249,7 @@ def _get_item(self, key):
         value = {}
         for k, v in six.iteritems(row):
             cf, _, col = k.partition(':')
+            col = to_native_str(col)
             value[col] = unpackb(v, encoding='utf-8')
             # XXX extract some fields as a set for faster in-checks
             if col in self._set_fields:
From 5e4f0866bf801b61b739e1379ef7a335bdff8382 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Tue, 13 Feb 2018 11:52:29 +0100
Subject: [PATCH 138/273] set serialization fix

---
 frontera/utils/msgpack.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontera/utils/msgpack.py b/frontera/utils/msgpack.py
index 8d8549a87..60467bfba 100644
--- a/frontera/utils/msgpack.py
+++ b/frontera/utils/msgpack.py
@@ -12,7 +12,7 @@ def restruct_for_pack(obj):
         for key in obj:
             obj[key] = restruct_for_pack(obj[key])
         return obj
-    elif isinstance(obj, list):
+    elif isinstance(obj, list) or isinstance(obj, set):
         return [restruct_for_pack(item) for item in obj]
     elif isinstance(obj, tuple):
         return tuple(restruct_for_pack([item for item in obj]))
From 0542494ac725d4cdd0d2d6c5bdae0bf998c3d214 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Tue, 13 Feb 2018 12:37:46 +0100
Subject: [PATCH 139/273] flush fix and docstring

---
 .../contrib/backends/hbase/domaincache.py     | 24 +++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py
index 0bde9e09d..a928f17cd 100644
--- a/frontera/contrib/backends/hbase/domaincache.py
+++ b/frontera/contrib/backends/hbase/domaincache.py
@@ -61,6 +61,24 @@ def _update_order(self, key):
 
 
 class DomainCache(LRUCache):
+    """
+    This is an implementation of a domain metadata cache backed by an HBase table. Its main purpose is to store
+    the domain metadata in Python-friendly structures while providing fast and reliable access.
+    The container has these features:
+    * LRU logic,
+    * two generations, the second generation is used for evicted items when the HBase batch isn't full,
+    * batched HBase writes,
+    * Python 3 and PyPy ready.
+
+    This container has these limitations:
+    1. value is always of dict type
+    2. data in value cannot be bigger than MAX_VALUE_SIZE (which is usually ~2Mb), otherwise fields will be dropped
+    with an error message
+    3. 255 > len(key) > 0
+    4. key and keys within the value dict are always of native string type
+    5. all keys are utf-8 strings.
+    6. the iterator of this container iterates only over first generation content.
+ """ MAX_VALUE_SIZE = int(DEFAULT_HBASE_THRIFT_FRAME_SIZE * 0.95) LOG_INTERVAL = 60.0 @@ -158,8 +176,6 @@ def __missing__(self, key): __len__ = None - __iter__ = None - clear = None maxsize = None @@ -206,7 +222,7 @@ def setdefault(self, key, default=None): return value def flush(self): - for k, v in super(DomainCache, self).__iter__(): + for k, v in six.iteritems(self): try: self._store_item_batch(k, v) except Exception: @@ -248,7 +264,7 @@ def _get_item(self, key): raise KeyError value = {} for k, v in six.iteritems(row): - cf, _, col = k.partition(':') + cf, _, col = k.partition(b':') col = to_native_str(col) value[col] = unpackb(v, encoding='utf-8') # XXX extract some fields as a set for faster in-checks From c12f0218a1a8eec92c76ef2ecb6f20e971249f41 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 16 Feb 2018 10:31:41 +0100 Subject: [PATCH 140/273] crash fix for py3 --- frontera/worker/components/batch_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 297f7cbc6..94763797d 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -113,7 +113,7 @@ def close(self): def rotate_and_log_domain_stats(self): self.logger.debug("Domain statistics of requests pushed to spider feed") - for partition_id, host_stats in sorted(self.domain_stats.items(), key=lambda x: x[1]): + for partition_id, host_stats in sorted(self.domain_stats.items(), key=lambda x: x[0]): self.logger.debug("PID %d =================================================================", partition_id) for hostname, count in host_stats.items(): self.logger.debug("%s\t%d", hostname, count) From 1ed0be6f08c5862c950c5c9705fbc35d2143263b Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 16 Feb 2018 10:35:16 +0100 Subject: [PATCH 141/273] improved error output --- frontera/worker/components/__init__.py | 2 +- frontera/worker/components/scoring_consumer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/worker/components/__init__.py b/frontera/worker/components/__init__.py index f71cb2b8f..c33129150 100644 --- a/frontera/worker/components/__init__.py +++ b/frontera/worker/components/__init__.py @@ -51,7 +51,7 @@ def run_and_reschedule(self): self.periodic_task.schedule() def run_errback(self, failure): - self.logger.exception(failure.value) + self.logger.error(failure.getTraceback()) if not self.stopped: self.periodic_task.schedule() diff --git a/frontera/worker/components/scoring_consumer.py b/frontera/worker/components/scoring_consumer.py index 4ccb6b0a9..eb40e7713 100644 --- a/frontera/worker/components/scoring_consumer.py +++ b/frontera/worker/components/scoring_consumer.py @@ -32,7 +32,7 @@ def run(self): try: msg = self.worker._decoder.decode(m) except (KeyError, TypeError) as e: - self.logger.error("Decoding error: %s", e) + self.logger.exception("Decoding error") continue else: if msg[0] == 'update_score': From 1a627d16c3f852fbe6682107686b87e64d44ea85 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 16 Mar 2018 01:34:50 +0100 Subject: [PATCH 142/273] add seeds run mode # Conflicts: # frontera/worker/strategy.py --- frontera/worker/strategies/__init__.py | 7 +-- frontera/worker/strategy.py | 75 ++++++++++++++++++++------ 2 files changed, 64 insertions(+), 18 deletions(-) diff --git a/frontera/worker/strategies/__init__.py b/frontera/worker/strategies/__init__.py index 
1a9cf4635..ce089f83a 100644
--- a/frontera/worker/strategies/__init__.py
+++ b/frontera/worker/strategies/__init__.py
@@ -19,21 +19,22 @@ class BaseCrawlingStrategy(object):
     After exiting from all of these methods states from meta field are passed back and stored in the backend.
     """
 
-    def __init__(self, manager, mb_stream, states_context):
+    def __init__(self, manager, args, mb_stream, states_context):
         self._mb_stream = mb_stream
         self._states_context = states_context
         self._manager = manager
 
     @classmethod
-    def from_worker(cls, manager, mb_stream, states_context):
+    def from_worker(cls, manager, args, mb_stream, states_context):
         """
         Called on instantiation in strategy worker.
 
         :param manager: :class: `Backend ` instance
+        :param args: dict with command line arguments from :term:`strategy worker`
         :param mb_stream: :class: `UpdateScoreStream ` instance
         :return: new instance
         """
-        return cls(manager, mb_stream, states_context)
+        return cls(manager, args, mb_stream, states_context)
 
     @abstractmethod
     def add_seeds(self, seeds):
diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py
index 48f9f9743..2bb584290 100644
--- a/frontera/worker/strategy.py
+++ b/frontera/worker/strategy.py
@@ -23,6 +23,8 @@
 from collections import Iterable
 from binascii import hexlify
 import six
+from six.moves.urllib.parse import urlparse
+from six.moves.urllib.request import urlopen
 
 logger = logging.getLogger("strategy-worker")
@@ -82,16 +84,19 @@ def flush(self):
 class BaseStrategyWorker(object):
     """Base strategy worker class."""
 
-    def __init__(self, settings, strategy_class):
+    def __init__(self, settings, strategy_class, strategy_args, is_add_seeds_mode):
         partition_id = settings.get('SCORING_PARTITION_ID')
         if partition_id is None or type(partition_id) != int:
             raise AttributeError("Scoring worker partition id isn't set.")
 
         messagebus = load_object(settings.get('MESSAGE_BUS'))
         mb = messagebus(settings)
-        spider_log = mb.spider_log()
         scoring_log = mb.scoring_log()
-        self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw')
+        self.add_seeds_mode = is_add_seeds_mode
+        if not self.add_seeds_mode:
+            spider_log = mb.spider_log()
+            self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw')
+            self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
         self.scoring_log_producer = scoring_log.producer()
 
         self._manager = FrontierManager.from_settings(settings, strategy_worker=True)
@@ -103,7 +108,6 @@ def __init__(self, settings, strategy_class):
         self.update_score = UpdateScoreStream(self.scoring_log_producer, self._encoder)
         self.states_context = StatesContext(self._manager.backend.states)
 
-        self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE')
-        self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context)
+        self.strategy = strategy_class.from_worker(self._manager, strategy_args, self.update_score, self.states_context)
         self.states = self._manager.backend.states
@@ -233,7 +237,28 @@ def work(self):
         self.stats['last_consumption_run'] = asctime()
         self.stats['consumed_since_start'] += consumed
 
-    def run(self):
+    def add_seeds(self, seeds_url):
+        logger.info("Seeds addition started from url %s", seeds_url)
+        if not seeds_url:
+            self.strategy.add_seeds(None)
+        else:
+            parsed = urlparse(seeds_url)
+            if parsed.scheme == "s3":
+                import boto3
+                s3 = boto3.resource("s3")
+                obj = s3.Object(parsed.hostname, parsed.path)
+                response = obj.get()
+                fh = response['Body']
+            else:
+                fh = urlopen(seeds_url)
+            from io import BufferedReader
+            buffered_stream = BufferedReader(fh)
+            self.strategy.add_seeds(buffered_stream)
+            buffered_stream.close()
+        self.update_score.flush()
+        self.states_context.release()
+
+    def run(self, seeds_url):
         def log_failure(failure):
             logger.exception(failure.value)
             if failure.frames:
@@ -256,12 +281,16 @@ def debug(sig, frame):
             logger.critical(str("").join(format_stack(frame)))
 
         install_shutdown_handlers(self._handle_shutdown)
-        self.task.start(interval=0).addErrback(errback_main)
-        self._logging_task.start(interval=30)
-        # run flushing states LoopingCall with random delay
-        flush_states_task_delay = randint(0, self._flush_interval)
-        logger.info("Starting flush-states task in %d seconds", flush_states_task_delay)
-        task.deferLater(reactor, flush_states_task_delay, run_flush_states_task)
+        if self.add_seeds_mode:
+            self.add_seeds(seeds_url)
+        else:
+            self.task.start(interval=0).addErrback(errback_main)
+            self._logging_task.start(interval=30)
+            # run flushing states LoopingCall with random delay
+            flush_states_task_delay = randint(0, self._flush_interval)
+            logger.info("Starting flush-states task in %d seconds", flush_states_task_delay)
+            task.deferLater(reactor, flush_states_task_delay, run_flush_states_task)
+
         signal(SIGUSR1, debug)
         reactor.run(installSignalHandlers=False)
@@ -363,6 +392,13 @@ def setup_environment():
                         help='Crawling strategy class path')
     parser.add_argument('--partition-id', type=int,
                         help="Instance partition id.")
+    parser.add_argument('--port', type=int, help="JSON RPC service port to listen on.")
+    parser.add_argument('--args', '-a', nargs='*', type=str, help="Optional arguments for the crawling strategy, "
+                                                                  "in the form of key=value pairs separated by spaces")
+    parser.add_argument('--add-seeds', action='store_true', help="Run in add seeds mode. The worker finishes after "
+                                                                 "running the strategy's add_seeds method")
+    parser.add_argument('--seeds-url', type=str, help="Seeds URL. S3 and native urlopen schemes are currently 
S3 and native urlopen schemas are currently " + "supported, implies add seeds run mode") args = parser.parse_args() settings = Settings(module=args.config) strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY') @@ -377,6 +413,12 @@ def setup_environment(): partition_id) settings.set('SCORING_PARTITION_ID', partition_id) + strategy_args = {} + if args.args: + for arg in args.args: + key, _, value = arg.partition("=") + strategy_args[key] = value if value else None + logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and exists(logging_config_path): fileConfig(logging_config_path, disable_existing_loggers=False) @@ -384,10 +426,13 @@ def setup_environment(): logging.basicConfig(level=args.log_level) logger.setLevel(args.log_level) logger.addHandler(CONSOLE) - return settings, strategy_class + + return settings, strategy_class, args.add_seeds, strategy_args, args.seeds_url if __name__ == '__main__': - settings, strategy_class = setup_environment() - worker = StrategyWorker(settings, strategy_class) - worker.run() + settings, strategy_class, is_add_seeds_mode, strategy_args, seeds_url = setup_environment() + worker = StrategyWorker(settings, strategy_class, strategy_args, is_add_seeds_mode) + server = WorkerJsonRpcService(worker, settings) + server.start_listening() + worker.run(seeds_url) From 47a001232e97ac8b7982ffc8c36640eb4bfc6b77 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 22 Mar 2018 16:23:45 +0100 Subject: [PATCH 143/273] overused buffer settings # Conflicts: # frontera/contrib/backends/remote/messagebus.py # frontera/core/__init__.py # frontera/settings/default_settings.py --- docs/source/topics/frontera-settings.rst | 37 +++++++++++++++++++ .../contrib/backends/remote/messagebus.py | 7 +++- frontera/core/__init__.py | 7 ++-- frontera/settings/default_settings.py | 2 + 4 files changed, 47 insertions(+), 6 deletions(-) diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index 8131bb885..249a249e4 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -306,6 +306,43 @@ Default: ``5.0`` (in progress + queued requests in that slot) / max allowed concurrent downloads per slot before slot is considered overused. This affects only Scrapy scheduler." + +.. setting:: OVERUSED_MAX_PER_KEY + +OVERUSED_MAX_PER_KEY +-------------------- + +Default: ``1200`` + +Purging will start When reaching this number of request per key. + +.. setting:: OVERUSED_KEEP_PER_KEY + +OVERUSED_KEEP_PER_KEY +--------------------- + +Default: ``1000`` + +After the purging this number of request will be left in the queue. + +.. setting:: OVERUSED_MAX_KEYS + +OVERUSED_MAX_KEYS +----------------- + +Default: ``1000`` + +Purging of keys will start when count of keys will reach this limit. + +.. setting:: OVERUSED_KEEP_KEYS + +OVERUSED_KEEP_KEYS +------------------ + +Default: ``100`` + +The number of keys for purging to leave. + .. 
setting:: REQUEST_MODEL REQUEST_MODEL diff --git a/frontera/contrib/backends/remote/messagebus.py b/frontera/contrib/backends/remote/messagebus.py index ce3614a13..b0da5a7a4 100644 --- a/frontera/contrib/backends/remote/messagebus.py +++ b/frontera/contrib/backends/remote/messagebus.py @@ -26,8 +26,11 @@ def __init__(self, manager): self.consumer = spider_feed.consumer(partition_id=self.partition_id) self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT')) self._logger = logging.getLogger("messagebus-backend") - self._buffer = OverusedBuffer(self._get_next_requests, settings.get("OVERUSED_MAX_QUEUE_SIZE"), - settings.get("OVERUSED_MAX_KEYS")) + self._buffer = OverusedBuffer(self._get_next_requests, + max_per_key=settings.get('OVERUSED_MAX_PER_KEY'), + keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"), + max_keys=settings.get('OVERUSED_MAX_KEYS'), + keep_keys=settings.get('OVERUSED_KEEP_KEYS')) self._logger.info("Consuming from partition id %d", self.partition_id) @classmethod diff --git a/frontera/core/__init__.py b/frontera/core/__init__.py index 0732eab6d..4da3fdb16 100644 --- a/frontera/core/__init__.py +++ b/frontera/core/__init__.py @@ -30,18 +30,17 @@ class OverusedBuffer(object): A buffering object for implementing the buffer of Frontera requests for overused domains/ips. It can be used when customizing backend to address efficient downloader pool usage. """ - def __init__(self, _get_func, max_per_key, max_keys): + def __init__(self, _get_func, max_per_key, keep_per_key, max_keys, keep_keys): """ :param _get_func: reference to get_next_requests() method of binded class - :param log_func: optional logging function, for logging of internal state """ self._pending = defaultdict(deque) self._get = _get_func self._log = getLogger("overusedbuffer") self._max_per_key = max_per_key - self._keep_per_key = int(max_per_key * 0.1) if max_per_key else None + self._keep_per_key = keep_per_key self._max_keys = max_keys - self._keep_keys = int(max_keys * 0.1) if max_keys else None + self._keep_keys = keep_keys def _get_key(self, request, type): return get_slot_key(request, type) diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index a37a44204..1da4f1613 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -37,7 +37,9 @@ DOMAINS_BLACKLIST = None OVERUSED_SLOT_FACTOR = 5.0 OVERUSED_MAX_PER_KEY = None +OVERUSED_KEEP_PER_KEY = 1000 OVERUSED_MAX_KEYS = None +OVERUSED_KEEP_KEYS = 100 QUEUE_HOSTNAME_PARTITIONING = False REDIS_BACKEND_CODEC = 'frontera.contrib.backends.remote.codecs.msgpack' REDIS_HOST = 'localhost' From 48fffd4ee8062b700ec3abfac88754c8614e39cc Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 23 Mar 2018 11:31:06 +0100 Subject: [PATCH 144/273] passing args to run() --- frontera/worker/stats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/worker/stats.py b/frontera/worker/stats.py index 8c33f9044..c99128ac9 100644 --- a/frontera/worker/stats.py +++ b/frontera/worker/stats.py @@ -30,7 +30,7 @@ def __init__(self, settings, *args, **kwargs): self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60) self._export_stats_task = LoopingCall(self.export_stats) - def run(self): + def run(self, *args, **kwargs): def errback_export_stats(failure): logger.exception(failure.value) @@ -42,7 +42,7 @@ def errback_export_stats(failure): if self.stats_producer: self._export_stats_task.start(interval=self._stats_interval)\ .addErrback(errback_export_stats) - 
super(StatsExportMixin, self).run() + super(StatsExportMixin, self).run(*args, **kwargs) def get_stats_tags(self, *args, **kwargs): """Get a tags dictionary for the metrics. From 514e5d376076066fa6f418bd25bcff2a0fcbf331 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 23 Mar 2018 14:05:34 +0100 Subject: [PATCH 145/273] removed seeds addition through spider log # Conflicts: # frontera/contrib/backends/remote/codecs/json.py # frontera/contrib/backends/remote/codecs/msgpack.py # frontera/worker/strategy.py --- docs/source/topics/frontera-settings.rst | 65 ++++++------------- .../contrib/backends/remote/codecs/json.py | 16 ----- .../contrib/backends/remote/codecs/msgpack.py | 5 -- .../contrib/backends/remote/messagebus.py | 5 +- frontera/core/__init__.py | 11 ++-- frontera/core/codec.py | 10 --- frontera/worker/strategies/__init__.py | 4 +- frontera/worker/strategy.py | 39 ++++------- 8 files changed, 41 insertions(+), 114 deletions(-) diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index 249a249e4..9be95de44 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -273,17 +273,24 @@ Default: ``30.0`` Used in DB worker, and it's a time interval between production of new batches for all partitions. If partition is busy, it will be skipped. +.. setting:: OVERUSED_KEEP_PER_KEY -.. setting:: OVERUSED_MAX_PER_KEY +OVERUSED_KEEP_PER_KEY +--------------------- -OVERUSED_MAX_PER_KEY --------------------- +Default: ``1000`` -Default: ``None`` +After the purging this number of requests will be left in the queue. + + +.. setting:: OVERUSED_KEEP_KEYS + +OVERUSED_KEEP_KEYS +------------------ -The maximum number of keys to store per OverusedBuffer slot. When the specified amount is reached the purging is -performed leaving 0.1 * specified value requests. +Default: ``100`` +The number of keys for purging to leave. .. setting:: OVERUSED_MAX_KEYS @@ -292,20 +299,7 @@ OVERUSED_MAX_KEYS Default: ``None`` -The maximum number of slots in OverusedBuffer. When this limit is reached the purging is performed -leaving 0.1 * specified value keys. - - -.. setting:: OVERUSED_SLOT_FACTOR - -OVERUSED_SLOT_FACTOR --------------------- - -Default: ``5.0`` - -(in progress + queued requests in that slot) / max allowed concurrent downloads per slot before slot is considered -overused. This affects only Scrapy scheduler." - +A threshold triggering the keys purging in OverusedBuffer. The purging will end up leaving :ref:`OVERUSED_KEEP_KEYS` .. setting:: OVERUSED_MAX_PER_KEY @@ -314,34 +308,17 @@ OVERUSED_MAX_PER_KEY Default: ``1200`` -Purging will start When reaching this number of request per key. +Purging will start when reaching this number of requests per key and leave :ref:`OVERUSED_KEEP_PER_KEY` requests. -.. setting:: OVERUSED_KEEP_PER_KEY - -OVERUSED_KEEP_PER_KEY ---------------------- - -Default: ``1000`` - -After the purging this number of request will be left in the queue. - -.. setting:: OVERUSED_MAX_KEYS - -OVERUSED_MAX_KEYS ------------------ - -Default: ``1000`` - -Purging of keys will start when count of keys will reach this limit. - -.. setting:: OVERUSED_KEEP_KEYS +.. setting:: OVERUSED_SLOT_FACTOR -OVERUSED_KEEP_KEYS ------------------- +OVERUSED_SLOT_FACTOR +-------------------- -Default: ``100`` +Default: ``5.0`` -The number of keys for purging to leave. +(in progress + queued requests in that slot) / max allowed concurrent downloads per slot before slot is considered +overused. 
This affects only Scrapy scheduler.

 .. setting:: REQUEST_MODEL

 REQUEST_MODEL

diff --git a/frontera/contrib/backends/remote/codecs/json.py b/frontera/contrib/backends/remote/codecs/json.py
index a2786d6cc..e0271e238 100644
--- a/frontera/contrib/backends/remote/codecs/json.py
+++ b/frontera/contrib/backends/remote/codecs/json.py
@@ -91,16 +91,6 @@ def __init__(self, request_model, *a, **kw):
         self.send_body = kw.pop('send_body', False)
         super(Encoder, self).__init__(request_model, *a, **kw)
 
-    def encode(self, obj):
-        encoded = _convert_and_save_type(obj)
-        return super(Encoder, self).encode(encoded)
-
-    def encode_add_seeds(self, seeds):
-        return self.encode({
-            'type': 'add_seeds',
-            'seeds': [_prepare_request_message(seed) for seed in seeds]
-        })
-
     def encode_page_crawled(self, response):
         return self.encode({
             'type': 'page_crawled',
@@ -186,12 +176,6 @@ def decode(self, message):
             return ('request_error', request, message['error'])
         if message['type'] == 'update_score':
             return ('update_score', self._request_from_object(message['r']), message['score'], message['schedule'])
-        if message['type'] == 'add_seeds':
-            seeds = []
-            for seed in message['seeds']:
-                request = self._request_from_object(seed)
-                seeds.append(request)
-            return ('add_seeds', seeds)
         if message['type'] == 'new_job_id':
             return ('new_job_id', int(message['job_id']))
         if message['type'] == 'offset':
diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py
index a5f3129d6..271a3a998 100644
--- a/frontera/contrib/backends/remote/codecs/msgpack.py
+++ b/frontera/contrib/backends/remote/codecs/msgpack.py
@@ -21,9 +21,6 @@ class Encoder(BaseEncoder):
     def __init__(self, request_model, *a, **kw):
         self.send_body = True if 'send_body' in kw and kw['send_body'] else False
 
-    def encode_add_seeds(self, seeds):
-        return packb([b'as', [_prepare_request_message(seed) for seed in seeds]], use_bin_type=True)
-
     def encode_page_crawled(self, response):
         return packb([b'pc', _prepare_response_message(response, self.send_body)], use_bin_type=True)
 
@@ -84,8 +81,6 @@ def decode(self, buffer):
             return ('update_score', self._request_from_object(obj[1]), obj[2], obj[3])
         if obj[0] == b're':
             return ('request_error', self._request_from_object(obj[1]), to_native_str(obj[2]))
-        if obj[0] == b'as':
-            return ('add_seeds', [self._request_from_object(x) for x in obj[1]])
         if obj[0] == b'njid':
             return ('new_job_id', int(obj[1]))
         if obj[0] == b'of':
diff --git a/frontera/contrib/backends/remote/messagebus.py b/frontera/contrib/backends/remote/messagebus.py
index b0da5a7a4..bfc1bc82d 100644
--- a/frontera/contrib/backends/remote/messagebus.py
+++ b/frontera/contrib/backends/remote/messagebus.py
@@ -44,10 +44,7 @@ def frontier_stop(self):
         self.spider_log_producer.flush()
 
     def add_seeds(self, seeds):
-        per_host = aggregate_per_host(seeds)
-        for host_fprint, host_links in six.iteritems(per_host):
-            self.spider_log_producer.send(host_fprint,
-                                          self._encoder.encode_add_seeds(host_links))
+        raise NotImplementedError("Seeds addition using the spider log isn't allowed")
 
     def page_crawled(self, response):
         host_fprint = get_host_fprint(response)
diff --git a/frontera/core/__init__.py b/frontera/core/__init__.py
index 4da3fdb16..4a490b6c8 100644
--- a/frontera/core/__init__.py
+++ b/frontera/core/__init__.py
@@ -59,9 +59,7 @@ def _get_pending(self, max_n_requests, overused_set):
             for key in keys.copy():
                 try:
                     yield pending[key].popleft()
-                    # contacts-crawler strategy related hack
-                    if self._max_per_key:
-                        self._check_and_purge(key)
+                    
self._check_and_purge(key) i += 1 except IndexError: keys.discard(key) @@ -69,7 +67,7 @@ def _get_pending(self, max_n_requests, overused_set): def _check_and_purge(self, key): pending = self._pending[key] - if len(pending) > self._max_per_key: + if self._max_per_key is not None and len(pending) > self._max_per_key: self._log.warning("Purging of key %s, of size %d has started", key, len(pending)) purged = 0 @@ -79,7 +77,7 @@ def _check_and_purge(self, key): self._log.warning("%d requests purged", purged) def _check_and_purge_keys(self): - if len(self._pending) > self._max_keys: + if self._max_keys is not None and len(self._pending) > self._max_keys: self._log.warning("Purging the keys") new_keys = set(sample(self._pending.keys(), self._keep_keys)) keys = set(self._pending.keys()) @@ -93,8 +91,7 @@ def get_next_requests(self, max_n_requests, **kwargs): if self._log.isEnabledFor(DEBUG): self._log.debug("Overused keys: %s", str(kwargs['overused_keys'])) self._log.debug("Pending: %i", self._get_pending_count()) - if self._max_keys: - self._check_and_purge_keys() + self._check_and_purge_keys() overused_set = set(kwargs['overused_keys']) requests = list(self._get_pending(max_n_requests, overused_set)) diff --git a/frontera/core/codec.py b/frontera/core/codec.py index 5e618b611..a2fceb2bc 100644 --- a/frontera/core/codec.py +++ b/frontera/core/codec.py @@ -31,16 +31,6 @@ def decode_request(self, buffer): @six.add_metaclass(ABCMeta) class BaseEncoder(object): - @abstractmethod - def encode_add_seeds(self, seeds): - """ - Encodes add_seeds message - - :param list seeds: A list of frontier Request objects - :return: bytes encoded message - """ - pass - @abstractmethod def encode_page_crawled(self, response): """ diff --git a/frontera/worker/strategies/__init__.py b/frontera/worker/strategies/__init__.py index ce089f83a..d810d6359 100644 --- a/frontera/worker/strategies/__init__.py +++ b/frontera/worker/strategies/__init__.py @@ -37,11 +37,11 @@ def from_worker(cls, manager, args, mb_stream, states_context): return cls(manager, args, mb_stream, states_context) @abstractmethod - def add_seeds(self, seeds): + def add_seeds(self, stream): """ Called when add_seeds event is received from spider log. - :param list seeds: A list of :class:`Request ` objects. 
+ :param file stream: A file-like object containing seed content """ @abstractmethod diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 2bb584290..b900d945d 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -14,6 +14,7 @@ from frontera.core.manager import FrontierManager from frontera.logger.handlers import CONSOLE from frontera.worker.stats import StatsExportMixin +from frontera.worker.server import WorkerJsonRpcService from twisted.internet.task import LoopingCall from twisted.internet import reactor, task @@ -50,19 +51,21 @@ def flush(self): class StatesContext(object): - def __init__(self, states): + def __init__(self, states, debug_mode=False, debug_stream=False): self._requests = [] self._states = states - self._fingerprints = set() + self._fingerprints = dict() + self._debug_stream = debug_stream + self.debug_mode = debug_mode def to_fetch(self, requests): - if isinstance(requests, Iterable): - self._fingerprints.update([x.meta[b'fingerprint'] for x in requests]) - return - self._fingerprints.add(requests.meta[b'fingerprint']) + requests = requests if isinstance(requests, Iterable) else [requests] + for request in requests: + fingerprint = request.meta[b'fingerprint'] + self._fingerprints[fingerprint] = request if self.debug_mode else None def fetch(self): - self._states.fetch(self._fingerprints) + self._states.fetch(self._fingerprints, debug_stream=self._debug_stream) self._fingerprints.clear() def refresh_and_keep(self, requests): @@ -147,10 +150,6 @@ def collect_batch(self): type = msg[0] batch.append(msg) try: - if type == 'add_seeds': - _, seeds = msg - self.states_context.to_fetch(seeds) - continue if type == 'page_crawled': _, response = msg self.states_context.to_fetch(response) @@ -186,13 +185,6 @@ def process_batch(self, batch): for msg in batch: type = msg[0] try: - if type == 'add_seeds': - _, seeds = msg - for seed in seeds: - seed.meta[b'jid'] = self.job_id - self.on_add_seeds(seeds) - self.stats['consumed_add_seeds'] += 1 - continue if type == 'page_crawled': _, response = msg if b'jid' not in response.meta or response.meta[b'jid'] != self.job_id: @@ -343,14 +335,6 @@ def _perform_shutdown(self, _=None): except: logger.exception('Error on shutdown') - def on_add_seeds(self, seeds): - logger.debug('Adding %i seeds', len(seeds)) - for seed in seeds: - logger.debug("URL: %s", seed.url) - self.states.set_states(seeds) - self.strategy.add_seeds(seeds) - self.states.update_cache(seeds) - def on_page_crawled(self, response): logger.debug("Page crawled %s", response.url) self.states.set_states([response]) @@ -371,6 +355,9 @@ def on_request_error(self, request, error): self.strategy.page_error(request, error) self.states.update_cache(request) + def set_process_info(self, process_info): + self.process_info = process_info + class StrategyWorker(StatsExportMixin, BaseStrategyWorker): """Main strategy worker class with useful extensions. 
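For orientation at this point in the series: with the spider-log seed path removed, a crawling strategy now receives its seeds as a file-like stream passed in by the strategy worker. A minimal sketch of a strategy implementing the new signature, assuming one URL per line (the class name and the blank-line handling are illustrative, not part of these patches):

    from frontera.core.components import States
    from frontera.worker.strategies import BaseCrawlingStrategy


    class FileSeedsStrategy(BaseCrawlingStrategy):
        def add_seeds(self, stream):
            # `stream` is a file-like object, e.g. an S3 object body or a
            # urlopen() response wrapped into a buffered reader.
            for line in stream:
                url = line.strip()
                if not url:
                    continue
                request = self.create_request(url)
                # fetch the state for the freshly created request from the backend
                self.refresh_states([request])
                if request.meta[b'state'] is States.NOT_CRAWLED:
                    request.meta[b'state'] = States.QUEUED
                    self.schedule(request)

        # page_crawled, filter_extracted_links, links_extracted and page_error
        # handlers are omitted here for brevity.

The in-repo BFS strategy follows the same pattern.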
From 825fee0ecb3d8132bbe87a50c3e5d88c820fbb4e Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 23 Mar 2018 14:06:02 +0100 Subject: [PATCH 146/273] codecs test fixed --- tests/test_codecs.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/test_codecs.py b/tests/test_codecs.py index 321489ef2..38fc9b312 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -59,7 +59,6 @@ def check_request(req1, req2): stats = {'_timestamp': 1499241748, 'tags': {'source': 'spider', 'partition_id': 0}, 'crawled_pages_count': 2, 'links_extracted_count': 3} msgs = [ - enc.encode_add_seeds([req]), enc.encode_page_crawled(Response(url="http://www.yandex.ru", body=b'SOME CONTENT', headers={b'hdr': b'value'}, request=req)), enc.encode_links_extracted(req, [req2]), @@ -74,13 +73,6 @@ def check_request(req1, req2): it = iter(msgs) - o = dec.decode(next(it)) - assert o[0] == 'add_seeds' - assert type(o[1]) == list - req_d = o[1][0] - check_request(req_d, req) - assert type(req_d) == Request - o = dec.decode(next(it)) assert o[0] == 'page_crawled' assert type(o[1]) == Response From 92def371acd76b2dbb3241244745a27f3851b738 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 23 Mar 2018 14:21:09 +0100 Subject: [PATCH 147/273] renaming add_seeds to read_seeds(file) --- frontera/worker/strategies/__init__.py | 4 ++-- frontera/worker/strategy.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/frontera/worker/strategies/__init__.py b/frontera/worker/strategies/__init__.py index d810d6359..4639fa599 100644 --- a/frontera/worker/strategies/__init__.py +++ b/frontera/worker/strategies/__init__.py @@ -37,9 +37,9 @@ def from_worker(cls, manager, args, mb_stream, states_context): return cls(manager, args, mb_stream, states_context) @abstractmethod - def add_seeds(self, stream): + def read_seeds(self, stream): """ - Called when add_seeds event is received from spider log. + Called when :term:`strategy worker` is run using add-seeds mode. 
:param file stream: A file-like object containing seed content """ diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index b900d945d..3aedb4d29 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -245,7 +245,7 @@ def add_seeds(self, seeds_url): fh = urlopen(seeds_url) from io import BufferedReader buffered_stream = BufferedReader(fh) - self.strategy.add_seeds(buffered_stream) + self.strategy.read_seeds(buffered_stream) buffered_stream.close() self.update_score.flush() self.states_context.release() From 514687618c9e509b183dae2f3ad86c4cb32a21c8 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 23 Mar 2018 17:32:59 +0100 Subject: [PATCH 148/273] fixed reading of seeds --- frontera/utils/s3.py | 35 +++++++++++++++++++++++++++++++++++ frontera/worker/strategy.py | 29 +++++++++++++++++++---------- 2 files changed, 54 insertions(+), 10 deletions(-) create mode 100644 frontera/utils/s3.py diff --git a/frontera/utils/s3.py b/frontera/utils/s3.py new file mode 100644 index 000000000..ed3bec4c6 --- /dev/null +++ b/frontera/utils/s3.py @@ -0,0 +1,35 @@ +from botocore.response import StreamingBody +from io import RawIOBase + + +class StreamingBodyIOBase(RawIOBase): + def __init__(self, streaming_body, *args, **kwargs): + assert isinstance(streaming_body, StreamingBody) + self._sb = streaming_body + super(StreamingBodyIOBase, self).__init__(*args, **kwargs) + + def close(self): + self._sb.close() + + def read(self, size=-1): + if size == -1: + size = None + return self._sb.read(size) + + def readable(self, *args, **kwargs): + return self._sb._amount_read < self._sb._content_length + + def tell(self): + return self._sb._amount_read + + def seekable(self, *args, **kwargs): + return False + + def writable(self, *args, **kwargs): + return False + + def isatty(self, *args, **kwargs): + return False + + + diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 3aedb4d29..dcddd47dc 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -232,21 +232,27 @@ def work(self): def add_seeds(self, seeds_url): logger.info("Seeds addition started from url %s", seeds_url) if not seeds_url: - self.strategy.add_seeds(None) + self.strategy.read_seeds(None) else: parsed = urlparse(seeds_url) if parsed.scheme == "s3": import boto3 + from frontera.utils.s3 import StreamingBodyIOBase s3 = boto3.resource("s3") - obj = s3.Object(parsed.hostname, parsed.path) + path = parsed.path.lstrip("/") + obj = s3.Object(parsed.hostname, path) response = obj.get() - fh = response['Body'] + fh = StreamingBodyIOBase(response['Body']) + elif parsed.scheme == "file": + fh = open(parsed.path, "rb") else: - fh = urlopen(seeds_url) - from io import BufferedReader - buffered_stream = BufferedReader(fh) - self.strategy.read_seeds(buffered_stream) - buffered_stream.close() + raise TypeError("Unsupported URL scheme") + self.strategy.read_seeds(fh) + try: + fh.close() + except: + logger.exception("Error during closing of seeds stream") + pass self.update_score.flush() self.states_context.release() @@ -273,8 +279,11 @@ def debug(sig, frame): logger.critical(str("").join(format_stack(frame))) install_shutdown_handlers(self._handle_shutdown) + signal(SIGUSR1, debug) if self.add_seeds_mode: self.add_seeds(seeds_url) + d = self.stop_tasks() + reactor.callLater(0, d.callback, None) else: self.task.start(interval=0).addErrback(errback_main) self._logging_task.start(interval=30) @@ -283,7 +292,6 @@ def debug(sig, frame): logger.info("Starting flush-states task 
in %d seconds", flush_states_task_delay) task.deferLater(reactor, flush_states_task_delay, run_flush_states_task) - signal(SIGUSR1, debug) reactor.run(installSignalHandlers=False) def log_status(self): @@ -331,7 +339,8 @@ def _perform_shutdown(self, _=None): self._manager.stop() logger.info("Closing message bus.") self.scoring_log_producer.close() - self.consumer.close() + if not self.add_seeds_mode: + self.consumer.close() except: logger.exception('Error on shutdown') From 033fcc55910053efd9718e80d56b9586a5062e70 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 23 Mar 2018 17:42:14 +0100 Subject: [PATCH 149/273] s3 dependency --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index 1c357c7bf..e7e997283 100644 --- a/setup.py +++ b/setup.py @@ -73,6 +73,9 @@ 'distributed': [ 'Twisted' ], + 's3': [ + 'boto3' + ], 'redis': [ 'redis>=2.10.5', 'hiredis>=0.2' From 27985a01bdfe820398fe27e29f1efd204dcd628a Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 10 Apr 2018 16:03:09 +0200 Subject: [PATCH 150/273] docs --- frontera/worker/strategies/__init__.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/frontera/worker/strategies/__init__.py b/frontera/worker/strategies/__init__.py index 4639fa599..73662dea1 100644 --- a/frontera/worker/strategies/__init__.py +++ b/frontera/worker/strategies/__init__.py @@ -20,6 +20,15 @@ class BaseCrawlingStrategy(object): """ def __init__(self, manager, args, mb_stream, states_context): + """ + Constructor of the crawling strategy. + + Args: + manager: is an instance of :class: `Backend ` instance + args: is a dict with command line arguments from :term:`strategy worker` + mb_stream: is a helper class for sending scheduled requests + states_context: a helper to operate with states for requests created in crawling strategy class + """ self._mb_stream = mb_stream self._states_context = states_context self._manager = manager @@ -29,9 +38,7 @@ def from_worker(cls, manager, args, mb_stream, states_context): """ Called on instantiation in strategy worker. - :param manager: :class: `Backend ` instance - :param args: dict with command line arguments from :term:`strategy worker` - :param mb_stream: :class: `UpdateScoreStream ` instance + see params for constructor :return: new instance """ return cls(manager, args, mb_stream, states_context) @@ -59,6 +66,12 @@ def filter_extracted_links(self, request, links): to links_extracted handler and is aiming to filter unused links and return only those where states information is needed. + The motivation for having the filtration separated before the actual handler is to save on HBase state + retrieval. Every non-cached link is requested from HBase and it may slow down the cluster significantly + on discovery-intensive crawls. Please make sure you use this class to filter out all the links you're not + going ot use in :method:`links_extracted + handler. + :param object request: The :class:`Request ` object for the crawled page. :param list links: A list of :class:`Request ` objects generated from \ the links extracted for the crawled page. @@ -70,7 +83,7 @@ def filter_extracted_links(self, request, links): def links_extracted(self, request, links): """ Called every time document was successfully crawled, and receiving links_extracted event from spider log, - after the links states are fetched from backend. Should be used to schedule links according to some rules. + after the link states are fetched from backend. 
Should be used to schedule links according to some rules. :param object request: The :class:`Request ` object for the crawled page. :param list links: A list of :class:`Request ` objects generated from \ @@ -114,9 +127,8 @@ def schedule(self, request, score=1.0, dont_queue=False): def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): """ - Creates request with specified fields, with state fetched from backend. This method only creates request, but - isn't getting it's state from storage. Use self.refresh_states on a batch of requests to get their states - from storage. + Creates request with specified fields. This method only creates request, but isn't getting it's state + from storage. Use self.refresh_states on a batch of requests to get their states from storage. :param url: str :param method: str From 66347c2d8ad31f83fc691252b3f2e60678065e85 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 20 Apr 2018 15:30:51 +0200 Subject: [PATCH 151/273] slot for hbase queue partitioning --- frontera/contrib/backends/hbase/__init__.py | 24 ++++++++++++--------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/frontera/contrib/backends/hbase/__init__.py b/frontera/contrib/backends/hbase/__init__.py index fbc030c20..8e98ac8d9 100644 --- a/frontera/contrib/backends/hbase/__init__.py +++ b/frontera/contrib/backends/hbase/__init__.py @@ -159,15 +159,19 @@ def get_interval(score, resolution): for request, score in batch: domain = request.meta[b'domain'] fingerprint = request.meta[b'fingerprint'] - if type(domain) == dict: + slot = request.meta.get(b'slot') + if slot is not None: + partition_id = self.partitioner.partition(slot, self.partitions) + key_crc32 = get_crc32(slot) + elif type(domain) == dict: partition_id = self.partitioner.partition(domain[b'name'], self.partitions) - host_crc32 = get_crc32(domain[b'name']) + key_crc32 = get_crc32(domain[b'name']) elif type(domain) == int: partition_id = self.partitioner.partition_by_hash(domain, self.partitions) - host_crc32 = domain + key_crc32 = domain else: - raise TypeError("domain of unknown type.") - item = (unhexlify(fingerprint), host_crc32, self.encoder.encode_request(request), score) + raise TypeError("partitioning key and info isn't provided") + item = (unhexlify(fingerprint), key_crc32, self.encoder.encode_request(request), score) score = 1 - score # because of lexicographical sort in HBase rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" % get_interval(score, 0.01), random_str) data.setdefault(rk, []).append((score, item)) @@ -237,12 +241,12 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): stream = BytesIO(buf) unpacker = Unpacker(stream) for item in unpacker: - fprint, host_crc32, _, _ = item - if host_crc32 not in queue: - queue[host_crc32] = [] - if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host: + fprint, key_crc32, _, _ = item + if key_crc32 not in queue: + queue[key_crc32] = [] + if max_requests_per_host is not None and len(queue[key_crc32]) > max_requests_per_host: continue - queue[host_crc32].append(fprint) + queue[key_crc32].append(fprint) count += 1 if fprint not in meta_map: From fc4dee36fcd5126a4f2d35f8d7ed194e3f31910c Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 26 Apr 2018 10:57:14 +0200 Subject: [PATCH 152/273] support of non-iterable argument --- frontera/worker/strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/worker/strategy.py 
b/frontera/worker/strategy.py index dcddd47dc..099cc9587 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -72,7 +72,7 @@ def refresh_and_keep(self, requests): self.to_fetch(requests) self.fetch() self._states.set_states(requests) - self._requests.extend(requests) + self._requests.extend(requests if isinstance(requests, Iterable) else [requests]) def release(self): self._states.update_cache(self._requests) From b8097251692b31601a416a5a613399886fb71067 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 8 May 2018 18:40:31 +0200 Subject: [PATCH 153/273] documented None values --- docs/source/topics/frontera-settings.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index 9be95de44..e3e3e48dc 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -299,16 +299,18 @@ OVERUSED_MAX_KEYS Default: ``None`` -A threshold triggering the keys purging in OverusedBuffer. The purging will end up leaving :ref:`OVERUSED_KEEP_KEYS` +A threshold triggering the keys purging in OverusedBuffer. The purging will end up leaving :ref:`OVERUSED_KEEP_KEYS`. +``None`` disables purging. .. setting:: OVERUSED_MAX_PER_KEY OVERUSED_MAX_PER_KEY -------------------- -Default: ``1200`` +Default: ``None`` Purging will start when reaching this number of requests per key and leave :ref:`OVERUSED_KEEP_PER_KEY` requests. +``None`` disables purging. .. setting:: OVERUSED_SLOT_FACTOR From 1e59bf570ed0718602243f5917ca1ece57d384f9 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 10 May 2018 12:23:01 +0200 Subject: [PATCH 154/273] DEFAULT_HBASE_THRIFT_FRAME_SIZE constant --- frontera/contrib/backends/hbase/domaincache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py index a928f17cd..36c769ee6 100644 --- a/frontera/contrib/backends/hbase/domaincache.py +++ b/frontera/contrib/backends/hbase/domaincache.py @@ -7,11 +7,11 @@ import six from frontera.contrib.backends.hbase.utils import HardenedBatch from frontera.utils.msgpack import restruct_for_pack -from happybase.batch import DEFAULT_HBASE_THRIFT_FRAME_SIZE from msgpack import packb, unpackb from w3lib.util import to_bytes, to_native_str DOMAIN_CACHE_BATCH_SIZE = 100 +DEFAULT_HBASE_THRIFT_FRAME_SIZE = 2097152 import collections from cachetools import Cache From 2b1ee27abbcae4d91033062b4ee1c25a9e9d5afb Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 10 May 2018 17:06:45 +0200 Subject: [PATCH 155/273] check for a key presence before deletion --- frontera/contrib/backends/hbase/domaincache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py index 36c769ee6..0f0a93d88 100644 --- a/frontera/contrib/backends/hbase/domaincache.py +++ b/frontera/contrib/backends/hbase/domaincache.py @@ -121,7 +121,8 @@ def __getitem__(self, key): self.__setitem__(key, value) else: self.__setitem__(key, value) - del self._second_gen[key] + if key in self._second_gen: # the second gen clean up could be triggered during set in first gen + del self._second_gen[key] else: self._update_order(key) return value From 2211ede8a4011533d5d42cda15e956a2762763fa Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 10 May 2018 17:14:06 +0200 Subject: [PATCH 
156/273] disabling scrapy test --- tests/test_scrapy_spider.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_scrapy_spider.py b/tests/test_scrapy_spider.py index 0b797ddcd..e41d09891 100644 --- a/tests/test_scrapy_spider.py +++ b/tests/test_scrapy_spider.py @@ -5,8 +5,9 @@ from scrapy import signals from scrapy.settings import Settings from tests.scrapy_spider.spiders.example import MySpider +import pytest - +@pytest.mark.skip(reason="no way of currently testing this") def test_scrapy_spider(): settings = Settings() settings.setmodule("tests.scrapy_spider.settings") From 7afb27e526cdeb4bb66c874b5db5c0659c88f5d4 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 10 May 2018 17:46:30 +0200 Subject: [PATCH 157/273] codecs tests are fixed --- frontera/contrib/backends/remote/codecs/json.py | 6 +++++- frontera/contrib/backends/remote/codecs/msgpack.py | 4 ++-- tests/test_codecs.py | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/frontera/contrib/backends/remote/codecs/json.py b/frontera/contrib/backends/remote/codecs/json.py index e0271e238..2c9822950 100644 --- a/frontera/contrib/backends/remote/codecs/json.py +++ b/frontera/contrib/backends/remote/codecs/json.py @@ -91,6 +91,10 @@ def __init__(self, request_model, *a, **kw): self.send_body = kw.pop('send_body', False) super(Encoder, self).__init__(request_model, *a, **kw) + def encode(self, obj): + encoded = _convert_and_save_type(obj) + return super(Encoder, self).encode(encoded) + def encode_page_crawled(self, response): return self.encode({ 'type': 'page_crawled', @@ -190,4 +194,4 @@ def decode_request(self, message): method=obj['method'], headers=obj['headers'], cookies=obj['cookies'], - meta=obj['meta']) + meta=obj['meta']) \ No newline at end of file diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py index 271a3a998..0bb1bdc2d 100644 --- a/frontera/contrib/backends/remote/codecs/msgpack.py +++ b/frontera/contrib/backends/remote/codecs/msgpack.py @@ -87,9 +87,9 @@ def decode(self, buffer): return ('offset', int(obj[1]), int(obj[2])) if obj[0] == b'st': return ('stats', obj[1]) - return TypeError('Unknown message type') + raise TypeError('Unknown message type') def decode_request(self, buffer): - return self._request_from_object(unpackb(buffer)) + return self._request_from_object(unpackb(buffer, encoding='utf-8')) diff --git a/tests/test_codecs.py b/tests/test_codecs.py index 38fc9b312..20c511b2b 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -83,7 +83,6 @@ def check_request(req1, req2): o[1].body is None o = dec.decode(next(it)) - print(o) assert o[0] == 'links_extracted' assert type(o[1]) == Request assert o[1].url == req.url and o[1].meta == req.meta @@ -122,6 +121,7 @@ def check_request(req1, req2): with pytest.raises(TypeError): dec.decode(next(it)) + class TestEncodeDecodeJson(unittest.TestCase): """ Test for testing methods `_encode_recursively` and `_decode_recursively` used in json codec From f1b6e012aee6675c06220c2cf6334d91a6d40c6b Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 10 May 2018 17:50:15 +0200 Subject: [PATCH 158/273] overusedbuffer test cases are fixed --- tests/test_core_overused_buffer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_core_overused_buffer.py b/tests/test_core_overused_buffer.py index 5a2ae2b06..849a12be0 100644 --- a/tests/test_core_overused_buffer.py +++ b/tests/test_core_overused_buffer.py @@ -30,7 
+30,7 @@ def get_once(self, max_n_requests, **kwargs): def test_base(self): self.req_it = iter(self.requests) - ob = OverusedBuffer(self.get_once, 100, 10000) + ob = OverusedBuffer(self.get_once, None, 100, None, 100) assert ob._get_pending_count() == 0 assert set(ob.get_next_requests(10, overused_keys=['www.example.com', 'example1.com'], @@ -54,7 +54,7 @@ def test_base(self): def test_purging_keys(self): self.req_it = cycle(self.requests) - ob = OverusedBuffer(self.get_once, 10, 100) + ob = OverusedBuffer(self.get_once, 10, 1, 100, 10) ob.get_next_requests(10, overused_keys=["example.com", "www.example.com"], key_type="domain") assert ob._get_pending_count() == 9 @@ -77,7 +77,7 @@ def get_random_host(): def test_purging_keys_set(self): self.generate_requests() self.req_it = cycle(self.requests) - ob = OverusedBuffer(self.get_once, 1000, 10) + ob = OverusedBuffer(self.get_once, 1000, 100, 10, 1) ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain") assert (ob._get_key_count()) == 10 From b0be022c78c5765a4351ce1744b2205a9ce528f9 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 11 May 2018 12:01:24 +0200 Subject: [PATCH 159/273] more work to codecs fix --- frontera/utils/msgpack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/utils/msgpack.py b/frontera/utils/msgpack.py index 60467bfba..5f77f8607 100644 --- a/frontera/utils/msgpack.py +++ b/frontera/utils/msgpack.py @@ -4,7 +4,7 @@ def restruct_for_pack(obj): """Recursively walk object's hierarchy.""" if isinstance(obj, six.text_type): - return obj.encode('utf8') + return obj if isinstance(obj, (bool, six.integer_types, float, six.binary_type)): return obj elif isinstance(obj, dict): From 017b70eed53b3dcb0bf5e86c842e02c06043ae04 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 11 May 2018 12:02:03 +0200 Subject: [PATCH 160/273] batch_size as argument and bug fix + new test case --- frontera/contrib/backends/hbase/domaincache.py | 8 ++++---- tests/test_domain_cache.py | 12 ++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py index 0f0a93d88..088353929 100644 --- a/frontera/contrib/backends/hbase/domaincache.py +++ b/frontera/contrib/backends/hbase/domaincache.py @@ -10,7 +10,6 @@ from msgpack import packb, unpackb from w3lib.util import to_bytes, to_native_str -DOMAIN_CACHE_BATCH_SIZE = 100 DEFAULT_HBASE_THRIFT_FRAME_SIZE = 2097152 import collections @@ -83,20 +82,21 @@ class DomainCache(LRUCache): MAX_VALUE_SIZE = int(DEFAULT_HBASE_THRIFT_FRAME_SIZE * 0.95) LOG_INTERVAL = 60.0 - def __init__(self, maxsize, connection, table_name, set_fields=None, on_get_func=None): + def __init__(self, maxsize, connection, table_name, set_fields=None, on_get_func=None, batch_size=100): super(DomainCache, self).__init__(maxsize) self._second_gen = dict() table_name = to_bytes(table_name) self._table = self._get_domain_table(connection, table_name) - self._batch = HardenedBatch(self._table, batch_size=DOMAIN_CACHE_BATCH_SIZE) + self._batch = HardenedBatch(self._table, batch_size=batch_size) self._set_fields = set(set_fields) if set_fields else set() self._on_get_func = on_get_func self.logger = logging.getLogger("domain-cache") self.stats = defaultdict(int) self.next_log = time() + self.LOG_INTERVAL + self.batch_size = batch_size # Primary methods @@ -165,7 +165,7 @@ def popitem(self): key, value = super(DomainCache, self).popitem() self._second_gen[key] = value 
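         # Entries evicted from the LRU generation are staged in the second-generation
         # dict and written out to HBase in batches once enough of them accumulate.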
self.stats["pops"] += 1 - if len(self._second_gen) >= DOMAIN_CACHE_BATCH_SIZE: + if len(self._second_gen) >= self.batch_size: self._flush_second_gen() self._second_gen.clear() self.stats["flushes"]+=1 diff --git a/tests/test_domain_cache.py b/tests/test_domain_cache.py index e4f60c693..02cc070e1 100644 --- a/tests/test_domain_cache.py +++ b/tests/test_domain_cache.py @@ -51,6 +51,18 @@ def test_domain_cache_setdefault(self): dc.flush() assert dc.setdefault('d3', {}) == {'domain': [3, 2, 1]} + def test_domain_cache_setdefault_with_second_gen_flush(self): + dc = DomainCache(2, self.conn, 'domain_metadata', batch_size=3) + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + dc.setdefault('d1', {})['domain'] += 1 + + assert dc.setdefault('d1', {}) == {'domain': 2} + def test_empty_key(self): dc = DomainCache(2, self.conn, 'domain_metadata') with self.assertRaises(KeyError): From 4e81956fe433b061218fbf36fc5496d29a5068a4 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 11 May 2018 13:24:03 +0200 Subject: [PATCH 161/273] table creation --- tests/test_domain_cache.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_domain_cache.py b/tests/test_domain_cache.py index 02cc070e1..13dd77123 100644 --- a/tests/test_domain_cache.py +++ b/tests/test_domain_cache.py @@ -9,6 +9,10 @@ class TestDomainCache(unittest.TestCase): def setUp(self): logging.basicConfig(level=logging.DEBUG) self.conn = Connection(host="hbase-docker", table_prefix="contacts", table_prefix_separator=":") + if 'domain_metadata' not in self.conn.tables(): + self.conn.create_table('domain_metadata', { + 'm': {'max_versions': 1, 'block_cache_enabled': 1,} + }) t = self.conn.table('domain_metadata') t.delete('d1') t.delete('d2') From 167ba3e1a4695675370ffab23e70c8851e99d921 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 11 May 2018 13:26:44 +0200 Subject: [PATCH 162/273] removed add_seeds --- tests/test_message_bus_backend.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_message_bus_backend.py b/tests/test_message_bus_backend.py index 5a0818836..587a8c100 100644 --- a/tests/test_message_bus_backend.py +++ b/tests/test_message_bus_backend.py @@ -38,12 +38,6 @@ def test_feed_partitions_less_than_equal_partion_id_and_partion_id_less_than_zer settings.SPIDER_PARTITION_ID = -1 self.assertRaises(ValueError, self.mbb_setup, settings) - def test_add_seeds(self): - mbb = self.mbb_setup() - mbb.add_seeds([r1, r2, r3]) - seeds = [mbb._decoder.decode(m)[1][0] for m in mbb.spider_log_producer.messages] - self.assertEqual(set([seed.url for seed in seeds]), set([r1.url, r2.url, r3.url])) - def test_page_crawled(self): mbb = self.mbb_setup() resp = Response(r1.url, body='body', request=r1) From 775dfbb5ccd63dcafd769bbdf8ac6301ab06dff5 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 11 May 2018 14:01:59 +0200 Subject: [PATCH 163/273] crawling strategy test fix --- tests/test_strategy.py | 44 ++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/tests/test_strategy.py b/tests/test_strategy.py index 1b39c5756..8c11326f7 100644 --- a/tests/test_strategy.py +++ b/tests/test_strategy.py @@ -1,14 +1,14 @@ # -*- coding: utf-8 -*- -from unittest import TestCase - -from frontera import FrontierManager -from frontera.core.components import States -from frontera.settings import Settings from frontera.worker.strategies import BaseCrawlingStrategy from 
frontera.worker.strategy import StatesContext +from frontera.settings import Settings +from frontera.core.manager import FrontierManager + +from frontera.contrib.backends.memory import MemoryStates +from frontera.core.components import States -class DummyCrawlingStrategy(BaseCrawlingStrategy): +class TestingCrawlingStrategy(BaseCrawlingStrategy): def add_seeds(self, seeds): pass @@ -21,6 +21,9 @@ def page_error(self, request, error): def links_extracted(self, request, links): pass + def filter_extracted_links(self, request, links): + pass + class MessageBusStream(object): def send(self, request, score=1.0, dont_queue=False): @@ -30,27 +33,30 @@ def flush(self): pass -class TestCrawlingStrategy(TestCase): - def setUp(self): +class TestCrawlingStrategy(object): + def strategy(self): settings = Settings() - settings.BACKEND = "frontera.contrib.backends.memory.MemoryDistributedBackend" - self.manager = FrontierManager.from_settings(settings, db_worker=False, strategy_worker=True) - self.stream = MessageBusStream() - self.states_ctx = StatesContext(self.manager.backend.states) - self.strategy = DummyCrawlingStrategy(self.manager, self.stream, self.states_ctx) + settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' + manager = FrontierManager.from_settings(settings, db_worker=False, strategy_worker=True) + stream = MessageBusStream() + states = MemoryStates(10) + states_ctx = StatesContext(states) + return TestingCrawlingStrategy.from_worker(manager, stream, states_ctx) def test_create_request(self): - req = self.strategy.create_request("http://test.com/someurl") + s = self.strategy() + req = s.create_request("http://test.com/someurl") assert req.meta[b'fingerprint'] == b'955ac04f1b1a96de60a5139ad90c80be87822159' def test_states_refresh(self): - states = self.manager.backend.states + s = self.strategy() + states = s._states_context._states url = "http://test.com/someurl" - req1 = self.strategy.create_request(url) + req1 = s.create_request(url) req1.meta[b'state'] = States.CRAWLED states.update_cache(req1) - req2 = self.strategy.create_request(url) - self.strategy.refresh_states([req2]) + req2 = s.create_request(url) + s.refresh_states([req2]) assert req2.meta[b'state'] == req1.meta[b'state'] - assert req2.meta[b'state'] == States.CRAWLED + assert req2.meta[b'state'] == States.CRAWLED \ No newline at end of file From 5f5d29f5d3c7b1d87dbf54fa30b083f7e7d9661b Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 11 May 2018 17:04:14 +0200 Subject: [PATCH 164/273] table presence fix --- tests/test_domain_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_domain_cache.py b/tests/test_domain_cache.py index 13dd77123..d2d6b9c60 100644 --- a/tests/test_domain_cache.py +++ b/tests/test_domain_cache.py @@ -9,7 +9,7 @@ class TestDomainCache(unittest.TestCase): def setUp(self): logging.basicConfig(level=logging.DEBUG) self.conn = Connection(host="hbase-docker", table_prefix="contacts", table_prefix_separator=":") - if 'domain_metadata' not in self.conn.tables(): + if b'domain_metadata' not in self.conn.tables(): self.conn.create_table('domain_metadata', { 'm': {'max_versions': 1, 'block_cache_enabled': 1,} }) From 662faad8b79ed529cae34287b6066457c0e94d43 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 11 May 2018 17:17:48 +0200 Subject: [PATCH 165/273] add seeds run mode test, sw test fix --- frontera/worker/strategies/bfs.py | 14 +++++----- frontera/worker/strategy.py | 10 +++---- tests/mocks/message_bus.py | 3 +++ 
tests/test_worker_strategy.py | 43 +++++++++++++++++++++---------- 4 files changed, 44 insertions(+), 26 deletions(-) diff --git a/frontera/worker/strategies/bfs.py b/frontera/worker/strategies/bfs.py index 6e9b20e00..eb9a8b8cd 100644 --- a/frontera/worker/strategies/bfs.py +++ b/frontera/worker/strategies/bfs.py @@ -6,12 +6,14 @@ class CrawlingStrategy(BaseCrawlingStrategy): - - def add_seeds(self, seeds): - for seed in seeds: - if seed.meta[b'state'] is States.NOT_CRAWLED: - seed.meta[b'state'] = States.QUEUED - self.schedule(seed) + def read_seeds(self, fh): + for url in fh: + url = url.strip() + req = self.create_request(url) + self.refresh_states(req) + if req.meta[b'state'] is States.NOT_CRAWLED: + req.meta[b'state'] = States.QUEUED + self.schedule(req) def page_crawled(self, response): response.meta[b'state'] = States.CRAWLED diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 099cc9587..54779be63 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -51,21 +51,19 @@ def flush(self): class StatesContext(object): - def __init__(self, states, debug_mode=False, debug_stream=False): + def __init__(self, states): self._requests = [] self._states = states self._fingerprints = dict() - self._debug_stream = debug_stream - self.debug_mode = debug_mode def to_fetch(self, requests): requests = requests if isinstance(requests, Iterable) else [requests] for request in requests: fingerprint = request.meta[b'fingerprint'] - self._fingerprints[fingerprint] = request if self.debug_mode else None + self._fingerprints[fingerprint] = request def fetch(self): - self._states.fetch(self._fingerprints, debug_stream=self._debug_stream) + self._states.fetch(self._fingerprints) self._fingerprints.clear() def refresh_and_keep(self, requests): @@ -112,7 +110,7 @@ def __init__(self, settings, strategy_class, strategy_args, is_add_seeds_mode): self.update_score = UpdateScoreStream(self.scoring_log_producer, self._encoder) self.states_context = StatesContext(self._manager.backend.states) self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') - self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context) + self.strategy = strategy_class.from_worker(self._manager, strategy_args, self.update_score, self.states_context) self.states = self._manager.backend.states self.stats = { 'consumed_since_start': 0, diff --git a/tests/mocks/message_bus.py b/tests/mocks/message_bus.py index dd4be971c..6057e174c 100644 --- a/tests/mocks/message_bus.py +++ b/tests/mocks/message_bus.py @@ -42,6 +42,9 @@ def flush(self): def get_offset(self, partition_id): return self.offset + def close(self): + pass + class ScoringLogStream(BaseScoringLogStream): diff --git a/tests/test_worker_strategy.py b/tests/test_worker_strategy.py index 415b9c0ed..1a00e218e 100644 --- a/tests/test_worker_strategy.py +++ b/tests/test_worker_strategy.py @@ -4,6 +4,8 @@ from frontera.core.models import Request, Response from frontera.core.components import States from unittest import TestCase +from os import remove +from os.path import exists r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'jid': 0}) r2 = Request('http://www.scrapy.org/', meta={b'fingerprint': b'2', b'jid': 0}) @@ -22,29 +24,42 @@ def setUp(self): settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 - self.sw = StrategyWorker(settings, 
CrawlingStrategy) + self.sw = StrategyWorker(settings, CrawlingStrategy, None, None) + + def tearDown(self): + if exists("/tmp/test_urls.txt"): + remove("/tmp/test_urls.txt") + pass def sw_setup_filtered_links(self): settings = Settings() settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 - return StrategyWorker(settings, FilteredLinksCrawlingStrategy) + return StrategyWorker(settings, FilteredLinksCrawlingStrategy, None, None) + + def sw_setup_add_seeds(self): + settings = Settings() + settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' + settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' + settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 + return StrategyWorker(settings, CrawlingStrategy, None, True) def test_add_seeds(self): - sw = self.sw - msg = sw._encoder.encode_add_seeds([r1, r2, r3, r4]) - sw.consumer.put_messages([msg]) - r2.meta[b'state'] = States.CRAWLED - sw.states.update_cache([r2]) - sw.work() + sw = self.sw_setup_add_seeds() + fh = open("/tmp/test_urls.txt", "wb") + fh.write(b"http://example1.com/\n") + fh.write(b"http://www.scrapy.org/\n") + fh.close() - r1.meta[b'state'] = States.QUEUED - r3.meta[b'state'] = States.QUEUED - r4.meta[b'state'] = States.QUEUED - assert set(sw.scoring_log_producer.messages) == \ - set([sw._encoder.encode_update_score(r, 1.0, True) - for r in [r1, r3, r4]]) + sw.run("file:///tmp/test_urls.txt") + + assert sw.add_seeds_mode == True + produced = [sw._decoder.decode(msg) for msg in sw.update_score._producer.messages] + assert len(produced) == 2 + assert all(msg[0] == 'update_score' for msg in produced) + assert produced[0][1].url == "http://example1.com/" + assert produced[1][1].url == "http://www.scrapy.org/" def test_page_crawled(self): sw = self.sw From 934b2f8765b54565b75c16cbdc48133be1188029 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 11 May 2018 17:30:18 +0200 Subject: [PATCH 166/273] memory backends tests fix --- frontera/contrib/backends/memory/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/frontera/contrib/backends/memory/__init__.py b/frontera/contrib/backends/memory/__init__.py index 456807fa9..2ca2dc261 100644 --- a/frontera/contrib/backends/memory/__init__.py +++ b/frontera/contrib/backends/memory/__init__.py @@ -10,7 +10,6 @@ from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.utils.url import parse_domain_from_url_fast import six -from six.moves import map from six.moves import range @@ -248,8 +247,10 @@ def __init__(self, manager): super(MemoryDFSOverusedBackend, self).__init__(manager) settings = manager.settings self.overused_buffer = OverusedBuffer(super(MemoryDFSOverusedBackend, self).get_next_requests, - settings.get("OVERUSED_MAX_QUEUE_SIZE"), - settings.get("OVERUSED_MAX_KEYS")) + settings.get("OVERUSED_MAX_PER_KEY"), + settings.get("OVERUSED_KEEP_PER_KEY"), + settings.get("OVERUSED_MAX_KEYS"), + settings.get("OVERUSED_KEEP_KEYS")) def get_next_requests(self, max_next_requests, **kwargs): return self.overused_buffer.get_next_requests(max_next_requests, **kwargs) From 95c46a3ce5057f0be1bd9ac1528c8800cbda81d3 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 11 May 2018 17:59:50 +0200 Subject: [PATCH 167/273] don't use hbase namespace for testing --- tests/test_domain_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_domain_cache.py 
b/tests/test_domain_cache.py index d2d6b9c60..34eb733c8 100644 --- a/tests/test_domain_cache.py +++ b/tests/test_domain_cache.py @@ -8,7 +8,7 @@ class TestDomainCache(unittest.TestCase): def setUp(self): logging.basicConfig(level=logging.DEBUG) - self.conn = Connection(host="hbase-docker", table_prefix="contacts", table_prefix_separator=":") + self.conn = Connection(host="hbase-docker") if b'domain_metadata' not in self.conn.tables(): self.conn.create_table('domain_metadata', { 'm': {'max_versions': 1, 'block_cache_enabled': 1,} From d9464c5d62a9202f337894d4ad0a1fc59877bd52 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 11 May 2018 18:01:07 +0200 Subject: [PATCH 168/273] strategy test case fix --- tests/test_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_strategy.py b/tests/test_strategy.py index 8c11326f7..aa6b6c6f8 100644 --- a/tests/test_strategy.py +++ b/tests/test_strategy.py @@ -41,7 +41,7 @@ def strategy(self): stream = MessageBusStream() states = MemoryStates(10) states_ctx = StatesContext(states) - return TestingCrawlingStrategy.from_worker(manager, stream, states_ctx) + return TestingCrawlingStrategy.from_worker(manager, None, stream, states_ctx) def test_create_request(self): s = self.strategy() From 92f3f544d57bd9ae4f4cc6548aabfff2863c19e5 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 14 May 2018 15:46:32 +0200 Subject: [PATCH 169/273] removed add_seeds --- tests/test_worker_db.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/tests/test_worker_db.py b/tests/test_worker_db.py index dce263072..4b78b244e 100644 --- a/tests/test_worker_db.py +++ b/tests/test_worker_db.py @@ -2,6 +2,7 @@ from frontera.worker.db import DBWorker, ScoringConsumer, IncomingConsumer, BatchGenerator from frontera.settings import Settings from frontera.core.components import States +import unittest r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'state': States.DEFAULT, b'jid': 0}) @@ -9,7 +10,7 @@ r3 = Request('https://www.dmoz.org', meta={b'fingerprint': b'3', b'state': States.DEFAULT, b'jid': 0}) -class TestDBWorker(object): +class TestDBWorker(unittest.TestCase): def dbw_setup(self, distributed=False): settings = Settings() @@ -19,15 +20,7 @@ def dbw_setup(self, distributed=False): settings.BACKEND = 'tests.mocks.components.FakeDistributedBackend' else: settings.BACKEND = 'tests.mocks.components.FakeBackend' - return DBWorker(settings, False, False, False) - - def test_add_seeds(self): - dbw = self.dbw_setup() - msg = dbw._encoder.encode_add_seeds([r1, r2, r3]) - incoming_consumer = dbw.slot.components[IncomingConsumer] - incoming_consumer.spider_log_consumer.put_messages([msg]) - incoming_consumer.run() - assert set([r.url for r in incoming_consumer.backend.seeds]) == set([r.url for r in [r1, r2, r3]]) + return DBWorker(settings, False, False, False, partitions="0") def test_page_crawled(self): dbw = self.dbw_setup() @@ -80,7 +73,7 @@ def test_new_batch(self): def test_offset(self): dbw = self.dbw_setup(True) - incoming_worker = dbw.slot.components[IncomingConsumer] + incoming_worker = dbw.slot.components[IncomingConsumer batch_gen = dbw.slot.components[BatchGenerator] batch_gen.spider_feed = incoming_worker.spider_feed batch_gen.spider_feed_producer = incoming_worker.spider_feed_producer From 1a4f8c82f0243b084524bbbdd9a60edb5f6a24bb Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 14 May 2018 15:54:06 +0200 Subject: [PATCH 170/273] syntax --- 
tests/test_worker_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_worker_db.py b/tests/test_worker_db.py index 4b78b244e..47fc6f3ec 100644 --- a/tests/test_worker_db.py +++ b/tests/test_worker_db.py @@ -73,7 +73,7 @@ def test_new_batch(self): def test_offset(self): dbw = self.dbw_setup(True) - incoming_worker = dbw.slot.components[IncomingConsumer + incoming_worker = dbw.slot.components[IncomingConsumer] batch_gen = dbw.slot.components[BatchGenerator] batch_gen.spider_feed = incoming_worker.spider_feed batch_gen.spider_feed_producer = incoming_worker.spider_feed_producer From 0ffb2ab345d29356850e542fdbb3de80f9ec9a51 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 15 May 2018 13:49:50 +0200 Subject: [PATCH 171/273] first draft version --- frontera/contrib/backends/hbase/__init__.py | 55 +++--- .../contrib/scrapy/schedulers/frontier.py | 3 +- frontera/core/manager.py | 161 +++++++++++++++--- frontera/settings/default_settings.py | 2 + frontera/utils/add_seeds.py | 38 +++++ frontera/utils/managers.py | 4 - frontera/worker/strategies/__init__.py | 31 +++- frontera/worker/strategy.py | 86 ++-------- tests/test_strategy.py | 2 +- 9 files changed, 253 insertions(+), 129 deletions(-) create mode 100644 frontera/utils/add_seeds.py diff --git a/frontera/contrib/backends/hbase/__init__.py b/frontera/contrib/backends/hbase/__init__.py index 8e98ac8d9..43fa5fdbc 100644 --- a/frontera/contrib/backends/hbase/__init__.py +++ b/frontera/contrib/backends/hbase/__init__.py @@ -496,28 +496,43 @@ def __init__(self, manager): self._queue = None self._states = None - @classmethod - def strategy_worker(cls, manager): - o = cls(manager) - settings = manager.settings - o._states = HBaseState(connection=o.connection, + def _init_states(self, settings): + self._states = HBaseState(connection=self.connection, table_name=settings.get('HBASE_STATES_TABLE'), cache_size_limit=settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'), write_log_size=settings.get('HBASE_STATE_WRITE_LOG_SIZE'), drop_all_tables=settings.get('HBASE_DROP_ALL_TABLES')) + + def _init_queue(self, settings): + self._queue = HBaseQueue(self.connection, self.queue_partitions, + settings.get('HBASE_QUEUE_TABLE'), drop=settings.get('HBASE_DROP_ALL_TABLES'), + use_snappy=settings.get('HBASE_USE_SNAPPY')) + + def _init_metadata(self, settings): + self._metadata = HBaseMetadata(self.connection, settings.get('HBASE_METADATA_TABLE'), + settings.get('HBASE_DROP_ALL_TABLES'), + settings.get('HBASE_USE_SNAPPY'), + settings.get('HBASE_BATCH_SIZE'), + settings.get('STORE_CONTENT')) + + @classmethod + def strategy_worker(cls, manager): + o = cls(manager) + o._init_states(manager.settings) return o @classmethod def db_worker(cls, manager): o = cls(manager) - settings = manager.settings - drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES') - o._queue = HBaseQueue(o.connection, o.queue_partitions, - settings.get('HBASE_QUEUE_TABLE'), drop=drop_all_tables, - use_snappy=settings.get('HBASE_USE_SNAPPY')) - o._metadata = HBaseMetadata(o.connection, settings.get('HBASE_METADATA_TABLE'), drop_all_tables, - settings.get('HBASE_USE_SNAPPY'), settings.get('HBASE_BATCH_SIZE'), - settings.get('STORE_CONTENT')) + o._init_queue(manager.settings) + o._init_metadata(manager.settings) + return o + + @classmethod + def local(cls, manager): + o = cls(manager) + o._init_queue(manager.settings) + o._init_states(manager.settings) return o @property @@ -560,16 +575,16 @@ def finished(self): def get_next_requests(self, max_next_requests, **kwargs): 
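         # In local run mode no explicit 'partitions' argument is passed, so the
         # backend now defaults to sweeping every queue partition.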
self.logger.debug("Querying queue table.") - for partition_id in set(kwargs.pop('partitions', [])): - count = 0 - for request in self.queue.get_next_requests( + results = [] + for partition_id in set(kwargs.pop('partitions', [i for i in range(self.queue_partitions)])): + requests = self.queue.get_next_requests( max_next_requests, partition_id, min_requests=self._min_requests, min_hosts=self._min_hosts, - max_requests_per_host=self._max_requests_per_host): - count += 1 - yield request - self.logger.debug("Got %d requests for partition id %d", count, partition_id) + max_requests_per_host=self._max_requests_per_host) + results.extend(requests) + self.logger.debug("Got %d requests for partition id %d", len(requests), partition_id) + return results def get_stats(self): """Helper to get stats dictionary for the backend. diff --git a/frontera/contrib/scrapy/schedulers/frontier.py b/frontera/contrib/scrapy/schedulers/frontier.py index a9ca35d52..a677392fc 100644 --- a/frontera/contrib/scrapy/schedulers/frontier.py +++ b/frontera/contrib/scrapy/schedulers/frontier.py @@ -107,7 +107,8 @@ def process_spider_output(self, response, result, spider): for element in result: if isinstance(element, Request): links.append(element) - yield element + else: + yield element frontier_request = response.meta[b'frontier_request'] self.frontier.page_crawled(response) # removed frontier part from .meta # putting it back, to persist .meta from original request diff --git a/frontera/core/manager.py b/frontera/core/manager.py index 1aa488ec1..06c98fd0c 100644 --- a/frontera/core/manager.py +++ b/frontera/core/manager.py @@ -1,16 +1,21 @@ from __future__ import absolute_import -from collections import OrderedDict +import logging +from abc import ABCMeta, abstractmethod +from collections import Iterable + +import six + +from frontera.core import models +from frontera.core.components import Backend, DistributedBackend, Middleware, CanonicalSolver from frontera.exceptions import NotConfigured +from frontera.settings import Settings from frontera.utils.misc import load_object -from frontera.settings import Settings, BaseSettings -from frontera.core.components import Backend, DistributedBackend, Middleware, CanonicalSolver -from frontera.core import models -import logging class ComponentsPipelineMixin(object): - def __init__(self, backend, middlewares=None, canonicalsolver=None, db_worker=False, strategy_worker=False): + def __init__(self, backend, strategy_class, strategy_args, middlewares=None, canonicalsolver=None, db_worker=False, + strategy_worker=False): self._logger_components = logging.getLogger("manager.components") # Load middlewares @@ -25,6 +30,14 @@ def __init__(self, backend, middlewares=None, canonicalsolver=None, db_worker=Fa # Load backend self._logger_components.debug("Loading backend '%s'", backend) self._backend = self._load_backend(backend, db_worker, strategy_worker) + self._backend.frontier_start() + + # Instantiate strategy + self._scoring_stream = LocalUpdateScoreStream(self.backend.queue) + self._states_context = StatesContext(self.backend.states) + if isinstance(strategy_class, str): + strategy_class = load_object(strategy_class) + self._strategy = strategy_class.from_worker(self, strategy_args, self._scoring_stream, self._states_context) @property def canonicalsolver(self): @@ -48,8 +61,13 @@ def backend(self): Can be defined with :setting:`BACKEND` setting. 
""" return self._backend + + @property + def strategy(self): + return self._strategy def _load_backend(self, backend, db_worker, strategy_worker): + # FIXME remove obsolete cls = load_object(backend) assert issubclass(cls, Backend), "backend '%s' must subclass Backend" % cls.__name__ if issubclass(cls, DistributedBackend): @@ -57,10 +75,10 @@ def _load_backend(self, backend, db_worker, strategy_worker): return cls.db_worker(self) if strategy_worker: return cls.strategy_worker(self) - raise RuntimeError("Distributed backends are meant to be used in workers.") + return cls.local(self) else: assert not strategy_worker, "In order to distribute backend only DistributedBackend " \ - "subclasses are allowed to use." + "subclasses are allowed to use" if hasattr(cls, 'from_manager'): return cls.from_manager(self) else: @@ -112,6 +130,11 @@ def _process_component(self, component, method_name, component_category, obj, re return_obj.__class__.__name__) return return_obj + def close(self): + self.strategy.close() + self._states_context.flush() + self.backend.frontier_stop() + class BaseManager(object): def __init__(self, request_model, response_model, settings=None): @@ -187,9 +210,9 @@ class FrontierManager(BaseManager, ComponentsPipelineMixin): providing an API to interact with. It's also responsible of loading and communicating all different frontier components. """ - def __init__(self, request_model, response_model, backend, middlewares=None, test_mode=False, max_requests=0, - max_next_requests=0, auto_start=True, settings=None, canonicalsolver=None, db_worker=False, - strategy_worker=False): + def __init__(self, request_model, response_model, backend, strategy_class, strategy_args, middlewares=None, + test_mode=False, max_requests=0, max_next_requests=0, auto_start=True, settings=None, + canonicalsolver=None, db_worker=False, strategy_worker=False): """ :param object/string request_model: The :class:`Request ` object to be \ used by the frontier. @@ -242,7 +265,8 @@ def __init__(self, request_model, response_model, backend, middlewares=None, tes # Manager finished flag self._finished = False - ComponentsPipelineMixin.__init__(self, backend=backend, middlewares=middlewares, + ComponentsPipelineMixin.__init__(self, backend=backend, strategy_class=strategy_class, + strategy_args=strategy_args, middlewares=middlewares, canonicalsolver=canonicalsolver, db_worker=db_worker, strategy_worker=strategy_worker) @@ -251,7 +275,7 @@ def __init__(self, request_model, response_model, backend, middlewares=None, tes self._components_pipeline = [ ('Middleware', self.middlewares, True), ('CanonicalSolver', self.canonicalsolver, False), - ('Backend', self.backend, False) + ('Strategy', self.strategy, False) ] # Log frontier manager start @@ -276,6 +300,8 @@ def from_settings(cls, settings=None, db_worker=False, strategy_worker=False): return FrontierManager(request_model=manager_settings.REQUEST_MODEL, response_model=manager_settings.RESPONSE_MODEL, backend=manager_settings.BACKEND, + strategy_class=manager_settings.STRATEGY, + strategy_args=manager_settings.STRATEGY_ARGS, middlewares=manager_settings.MIDDLEWARES, test_mode=manager_settings.TEST_MODE, max_requests=manager_settings.MAX_REQUESTS, @@ -340,7 +366,7 @@ def finished(self): Boolean value indicating if the frontier has finished. See :ref:`Finish conditions `. 
""" if not self._finished: - return self.backend.finished() + return self.strategy.finished() return True def start(self): @@ -367,24 +393,16 @@ def stop(self): self._process_components(method_name='frontier_stop') self._stopped = True - def add_seeds(self, seeds): + def add_seeds(self, seeds_file): """ - Adds a list of seed requests (seed URLs) as entry point for the crawl. + Performs seeds addition procedure. Using file-like object, calls read_seeds method of crawling strategy. - :param list seeds: A list of :class:`Request ` objects. + :param file seeds_file: A file-like object passed to read_seeds :return: None. """ self._check_startstop() - # FIXME probably seeds should be a generator here - assert len(seeds), "Empty seeds list" - for seed in seeds: - assert isinstance(seed, self._request_model), "Seed objects must subclass '%s', '%s' found" % \ - (self._request_model.__name__, type(seed).__name__) - self._logger.debug('ADD_SEEDS urls_length=%d', len(seeds)) - self._process_components(method_name='add_seeds', - obj=seeds, - return_classes=(list,)) # TODO: Dar vuelta + self.strategy.read_seeds(seeds_file) def get_next_requests(self, max_next_requests=0, **kwargs): """ @@ -453,9 +471,13 @@ def page_crawled(self, response): type(response.request).__name__) assert isinstance(response, self.response_model), "Response object must subclass '%s', '%s' found" % \ (self.response_model.__name__, type(response).__name__) + self._states_context.to_fetch(response) + self._states_context.fetch() + self._states_context._states.set_states(response) self._process_components(method_name='page_crawled', obj=response, return_classes=self.response_model) + self._states_context._states.update_cache(response) def links_extracted(self, request, links): """ @@ -477,8 +499,22 @@ def links_extracted(self, request, links): self._process_components(method_name='links_extracted', obj=request, return_classes=self.request_model, + components=(0, 1), links=links) + filtered = self.strategy.filter_extracted_links(request, links) + if filtered: + self._states_context.to_fetch(request) + self._states_context.to_fetch(filtered) + self._states_context.fetch() + self._states_context._states.set_states(filtered) + self._process_components(method_name='links_extracted', + obj=request, + return_classes=self.request_model, + components=(2,), + links=filtered) + self._states_context._states.update_cache(filtered) + def request_error(self, request, error): """ Informs the frontier about a page crawl error. An error identifier must be provided. @@ -490,10 +526,14 @@ def request_error(self, request, error): """ self._check_startstop() self._logger.debug('PAGE_REQUEST_ERROR url=%s error=%s', request.url, error) + self._states_context.to_fetch(request) + self._states_context.fetch() + self._states_context._states.set_states(request) processed_page = self._process_components(method_name='request_error', obj=request, return_classes=self.request_model, error=error) + self._states_context._states.update_cache(request) return processed_page def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): @@ -514,6 +554,75 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No return_classes=self.request_model, components=(0,1)) + + def _check_startstop(self): assert self._started, "Frontier not started!" assert not self._stopped, "Call to stopped frontier!" 
+ + +@six.add_metaclass(ABCMeta) +class UpdateScoreStream(object): + + @abstractmethod + def send(self, request, score=1.0, dont_queue=False): + pass + + def flush(self): + pass + + +class MessageBusUpdateScoreStream(UpdateScoreStream): + def __init__(self, producer, encoder): + self._producer = producer + self._encoder = encoder + + def send(self, request, score=1.0, dont_queue=False): + encoded = self._encoder.encode_update_score( + request=request, + score=score, + schedule=not dont_queue + ) + self._producer.send(None, encoded) + + +class LocalUpdateScoreStream(UpdateScoreStream): + def __init__(self, queue): + self._queue = queue + + def send(self, request, score=1.0, dont_queue=False): + self._queue.schedule([(request.meta[b'fingerprint'], score, request, not dont_queue)]) + + +class StatesContext(object): + + def __init__(self, states): + self._requests = [] + self._states = states + self._fingerprints = dict() + self.logger = logging.getLogger("states-context") + + def to_fetch(self, requests): + requests = requests if isinstance(requests, Iterable) else [requests] + for request in requests: + fingerprint = request.meta[b'fingerprint'] + self._fingerprints[fingerprint] = request + + def fetch(self): + self._states.fetch(self._fingerprints) + self._fingerprints.clear() + + def refresh_and_keep(self, requests): + self.to_fetch(requests) + self.fetch() + self._states.set_states(requests) + self._requests.extend(requests if isinstance(requests, Iterable) else [requests]) + + def release(self): + self._states.update_cache(self._requests) + self._requests = [] + + def flush(self): + self.logger.info("Flushing states") + self._states.flush() + self.logger.info("Flushing of states finished") \ No newline at end of file diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 1da4f1613..28fbe8cee 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -67,6 +67,8 @@ STATE_CACHE_SIZE = 1000000 STATE_CACHE_SIZE_LIMIT = 0 STORE_CONTENT = False +STRATEGY = "frontera.worker.strategies.bfs.CrawlingStrategy" +STRATEGY_ARGS = {} SW_FLUSH_INTERVAL = 300 TEST_MODE = False TLDEXTRACT_DOMAIN_INFO = False diff --git a/frontera/utils/add_seeds.py b/frontera/utils/add_seeds.py new file mode 100644 index 000000000..9fff305cf --- /dev/null +++ b/frontera/utils/add_seeds.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +from frontera.core.manager import FrontierManager +from frontera.settings import Settings +from frontera.logger.handlers import CONSOLE +from argparse import ArgumentParser +import logging +from logging.config import fileConfig +from os.path import exists + + +logger = logging.getLogger(__name__) + +parser = ArgumentParser(description="Frontera local add seeds utility") +parser.add_argument('--config', type=str, required=True, + help='Settings module name, should be accessible by import') +parser.add_argument('--log-level', '-L', type=str, default='INFO', + help="Log level, for ex. 
DEBUG, INFO, WARN, ERROR, FATAL") +parser.add_argument('--seeds-file', type=str, required=True, help="Seeds file path") +args = parser.parse_args() +settings = Settings(module=args.config) +logging_config_path = settings.get("LOGGING_CONFIG") +if logging_config_path and exists(logging_config_path): + fileConfig(logging_config_path, disable_existing_loggers=False) +else: + logging.basicConfig(level=args.log_level) + logger.setLevel(args.log_level) + logger.addHandler(CONSOLE) + +fh = open(args.seeds_file, "rb") + +logger.info("Starting local seeds addition from file %s", args.seeds_file) + +manager = FrontierManager.from_settings(settings) +manager.add_seeds(fh) +manager.stop() +manager.close() + +logger.info("Seeds addition finished") \ No newline at end of file diff --git a/frontera/utils/managers.py b/frontera/utils/managers.py index 867f7a4f6..2f4c8b638 100644 --- a/frontera/utils/managers.py +++ b/frontera/utils/managers.py @@ -24,10 +24,6 @@ def start(self): def stop(self): self.manager.stop() - def add_seeds(self, seeds): - frontier_seeds = [self.request_converter.to_frontier(seed) for seed in seeds] - self.manager.add_seeds(seeds=frontier_seeds) - def get_next_requests(self, max_next_requests=0, **kwargs): frontier_requests = self.manager.get_next_requests(max_next_requests=max_next_requests, **kwargs) return [self.request_converter.from_frontier(frontier_request) for frontier_request in frontier_requests] diff --git a/frontera/worker/strategies/__init__.py b/frontera/worker/strategies/__init__.py index 73662dea1..894266f11 100644 --- a/frontera/worker/strategies/__init__.py +++ b/frontera/worker/strategies/__init__.py @@ -19,29 +19,29 @@ class BaseCrawlingStrategy(object): After exiting from all of these methods states from meta field are passed back and stored in the backend. """ - def __init__(self, manager, args, mb_stream, states_context): + def __init__(self, manager, args, scheduled_stream, states_context): """ Constructor of the crawling strategy. Args: manager: an instance of :class: `Backend ` args: is a dict with command line arguments from :term:`strategy worker` - mb_stream: is a helper class for sending scheduled requests + scheduled_stream: is a helper class for sending scheduled requests states_context: a helper to operate with states for requests created in crawling strategy class """ - self._mb_stream = mb_stream + self._scheduled_stream = scheduled_stream self._states_context = states_context self._manager = manager @classmethod - def from_worker(cls, manager, args, mb_stream, states_context): + def from_worker(cls, manager, args, scheduled_stream, states_context): """ Called on instantiation in strategy worker. See params for constructor. :return: new instance """ - return cls(manager, args, mb_stream, states_context) + return cls(manager, args, scheduled_stream, states_context) @abstractmethod def read_seeds(self, stream): @@ -112,7 +112,7 @@ def close(self): """ Called when strategy worker is about to close crawling strategy.
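
For illustration, a minimal concrete strategy against this constructor and the hooks referenced throughout this patch (read_seeds, filter_extracted_links, links_extracted, page_crawled, page_error, finished) could look as follows; the class itself and its scoring policy are made up:

    from frontera.worker.strategies import BaseCrawlingStrategy

    class ExampleStrategy(BaseCrawlingStrategy):
        def read_seeds(self, stream):
            # stream is the file-like object handed to FrontierManager.add_seeds()
            for line in stream:
                url = line.strip().decode('utf-8')
                if url:
                    self.schedule(self.create_request(url), score=1.0)

        def filter_extracted_links(self, request, links):
            return links  # a real strategy would drop duplicates or off-domain links here

        def links_extracted(self, request, links):
            for link in links:
                self.schedule(link, score=0.5)

        def page_crawled(self, response):
            pass  # e.g. collect per-domain statistics

        def page_error(self, request, error):
            self.schedule(request, score=0.0, dont_queue=True)  # record score, do not requeue

        def finished(self):
            return False  # crawl until the worker is stopped
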
""" - self._mb_stream.flush() + self._scheduled_stream.flush() self._states_context.release() def schedule(self, request, score=1.0, dont_queue=False): @@ -123,7 +123,7 @@ def schedule(self, request, score=1.0, dont_queue=False): :param score: float from 0.0 to 1.0 :param dont_queue: bool, True - if no need to schedule, only update the score """ - self._mb_stream.send(request, score, dont_queue) + self._scheduled_stream.send(request, score, dont_queue) def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): """ @@ -147,3 +147,20 @@ def refresh_states(self, requests): :param requests: list(:class:`Request `) """ self._states_context.refresh_and_keep(requests) + + def request_error(self, request, error): + """ + DEPRECATED. + + Convenience method, called by FronteraManager, please use page_error() instead. + + :param request: :class:`Request ` + :param error: str with error description + """ + self.page_error(request, error) + + def frontier_start(self): + pass + + def frontier_stop(self): + pass \ No newline at end of file diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 54779be63..6bc51ccf4 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -1,87 +1,33 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -from time import asctime + import logging -from traceback import format_stack, format_tb -from signal import signal, SIGUSR1 -from logging.config import fileConfig from argparse import ArgumentParser +from binascii import hexlify +from logging.config import fileConfig from os.path import exists from random import randint -from frontera.utils.misc import load_object -from frontera.utils.ossignal import install_shutdown_handlers - -from frontera.core.manager import FrontierManager -from frontera.logger.handlers import CONSOLE -from frontera.worker.stats import StatsExportMixin -from frontera.worker.server import WorkerJsonRpcService +from signal import signal, SIGUSR1 +from time import asctime +from traceback import format_stack, format_tb -from twisted.internet.task import LoopingCall +import six +from six.moves.urllib.parse import urlparse from twisted.internet import reactor, task from twisted.internet.defer import Deferred +from twisted.internet.task import LoopingCall +from frontera.core.manager import FrontierManager, MessageBusUpdateScoreStream, StatesContext +from frontera.logger.handlers import CONSOLE from frontera.settings import Settings -from collections import Iterable -from binascii import hexlify -import six -from six.moves.urllib.parse import urlparse -from six.moves.urllib.request import urlopen - +from frontera.utils.misc import load_object +from frontera.utils.ossignal import install_shutdown_handlers +from frontera.worker.server import WorkerJsonRpcService +from frontera.worker.stats import StatsExportMixin logger = logging.getLogger("strategy-worker") -class UpdateScoreStream(object): - - def __init__(self, producer, encoder): - self._producer = producer - self._encoder = encoder - - def send(self, request, score=1.0, dont_queue=False): - encoded = self._encoder.encode_update_score( - request=request, - score=score, - schedule=not dont_queue - ) - self._producer.send(None, encoded) - - def flush(self): - pass - - -class StatesContext(object): - - def __init__(self, states): - self._requests = [] - self._states = states - self._fingerprints = dict() - - def to_fetch(self, requests): - requests = requests if isinstance(requests, Iterable) else [requests] - for 
request in requests: - fingerprint = request.meta[b'fingerprint'] - self._fingerprints[fingerprint] = request - - def fetch(self): - self._states.fetch(self._fingerprints) - self._fingerprints.clear() - - def refresh_and_keep(self, requests): - self.to_fetch(requests) - self.fetch() - self._states.set_states(requests) - self._requests.extend(requests if isinstance(requests, Iterable) else [requests]) - - def release(self): - self._states.update_cache(self._requests) - self._requests = [] - - def flush(self): - logger.info("Flushing states") - self._states.flush() - logger.info("Flushing of states finished") - - class BaseStrategyWorker(object): """Base strategy worker class.""" @@ -107,7 +53,7 @@ def __init__(self, settings, strategy_class, strategy_args, is_add_seeds_mode): self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model) self._encoder = encoder_cls(self._manager.request_model) - self.update_score = UpdateScoreStream(self.scoring_log_producer, self._encoder) + self.update_score = MessageBusUpdateScoreStream(self.scoring_log_producer, self._encoder) self.states_context = StatesContext(self._manager.backend.states) self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') self.strategy = strategy_class.from_worker(self._manager, strategy_args, self.update_score, self.states_context) diff --git a/tests/test_strategy.py b/tests/test_strategy.py index aa6b6c6f8..f1b58b2f5 100644 --- a/tests/test_strategy.py +++ b/tests/test_strategy.py @@ -2,7 +2,7 @@ from frontera.worker.strategies import BaseCrawlingStrategy from frontera.worker.strategy import StatesContext from frontera.settings import Settings -from frontera.core.manager import FrontierManager +from frontera.core.manager import FrontierManager, StatesContext from frontera.contrib.backends.memory import MemoryStates from frontera.core.components import States From d701f552c42917bbea632fa9787eca40b5359311 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 23 May 2018 10:40:25 +0200 Subject: [PATCH 172/273] batched workflow extracted --- frontera/worker/strategy.py | 222 ++++++++++++++++++------------------ 1 file changed, 114 insertions(+), 108 deletions(-) diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 6bc51ccf4..de13bb5ad 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -10,6 +10,7 @@ from signal import signal, SIGUSR1 from time import asctime from traceback import format_stack, format_tb +from collections import defaultdict import six from six.moves.urllib.parse import urlparse @@ -28,6 +29,111 @@ logger = logging.getLogger("strategy-worker") +class BatchedWorkflow(object): + def __init__(self, strategy, states_context, scoring_stream, stats, job_id): + self.strategy = strategy + self.states_context = states_context + self.scoring_stream = scoring_stream + self.stats = stats + self.job_id = job_id + + self._batch = [] + + def collection_start(self): + self._batch = [] + + def process(self): + self.states_context.fetch() + for event in self._batch: + typ = event[0] + try: + if typ == 'page_crawled': + _, response = event + if b'jid' not in response.meta or response.meta[b'jid'] != self.job_id: + continue + self._on_page_crawled(response) + self.stats['consumed_page_crawled'] += 1 + continue + if typ == 'links_extracted': + _, request, links = event + if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: + continue + self._on_links_extracted(request, links) + self.stats['consumed_links_extracted'] += 
1 + continue + if typ == 'request_error': + _, request, error = event + if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: + continue + self._on_request_error(request, error) + self.stats['consumed_request_error'] += 1 + continue + self._on_unknown_message(event) + except Exception as exc: + logger.exception(exc) + pass + self.scoring_stream.flush() + self.states_context.release() + + def collect(self, event): + typ = event[0] + self._batch.append(event) + try: + if typ == 'page_crawled': + _, response = event + self.states_context.to_fetch(response) + return + if typ == 'links_extracted': + _, request, links = event + self.states_context.to_fetch(request) + filtered_links = self.strategy.filter_extracted_links(request, links) + if filtered_links: + # modify last message with a new links list + self._batch[-1] = (typ, request, filtered_links) + self.states_context.to_fetch(filtered_links) + else: + # drop last message if nothing to process + self._batch.pop() + self.stats['dropped_links_extracted'] += 1 + return + if typ == 'request_error': + _, request, error = event + self.states_context.to_fetch(request) + return + if typ == 'offset': + return + self._collect_unknown_event(event) + except: + logger.exception("Error during event collection") + pass + + def _collect_unknown_event(self, msg): + logger.debug('Unknown message %s', msg) + + def _on_unknown_message(self, msg): + pass + + def _on_page_crawled(self, response): + logger.debug("Page crawled %s", response.url) + self.states_context.states.set_states([response]) + self.strategy.page_crawled(response) + self.states_context.states.update_cache(response) + + def _on_links_extracted(self, request, links): + logger.debug("Links extracted %s (%d)", request.url, len(links)) + for link in links: + logger.debug("URL: %s", link.url) + self.states_context.states.set_states(links) + self.strategy.links_extracted(request, links) + self.states_context.states.update_cache(links) + + def _on_request_error(self, request, error): + logger.debug("Page error %s (%s)", request.url, error) + self.states_context.states.set_states(request) + self.strategy.page_error(request, error) + self.states_context.states.update_cache(request) + + class BaseStrategyWorker(object): """Base strategy worker class.""" @@ -58,109 +164,29 @@ def __init__(self, settings, strategy_class, strategy_args, is_add_seeds_mode): self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') self.strategy = strategy_class.from_worker(self._manager, strategy_args, self.update_score, self.states_context) self.states = self._manager.backend.states - self.stats = { - 'consumed_since_start': 0, - 'consumed_add_seeds': 0, - 'consumed_page_crawled': 0, - 'consumed_links_extracted': 0, - 'consumed_request_error': 0, - 'dropped_links_extracted': 0, - } - self.job_id = 0 + self.stats = defaultdict(int) + self.workflow = BatchedWorkflow(self.strategy, self.states_context, self.update_score, self.stats, 0) self.task = LoopingCall(self.work) self._logging_task = LoopingCall(self.log_status) self._flush_states_task = LoopingCall(self.flush_states) self._flush_interval = settings.get("SW_FLUSH_INTERVAL") logger.info("Strategy worker is initialized and consuming partition %d", partition_id) - def collect_unknown_message(self, msg): - logger.debug('Unknown message %s', msg) - - def on_unknown_message(self, msg): - pass - - def collect_batch(self): + def work(self): consumed = 0 - batch = [] + self.workflow.collection_start() for m in 
self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0): try: - msg = self._decoder.decode(m) + event = self._decoder.decode(m) except (KeyError, TypeError) as e: - logger.error("Decoding error:") - logger.exception(e) + logger.exception("Decoding error") logger.debug("Message %s", hexlify(m)) continue else: - type = msg[0] - batch.append(msg) - try: - if type == 'page_crawled': - _, response = msg - self.states_context.to_fetch(response) - continue - if type == 'links_extracted': - _, request, links = msg - self.states_context.to_fetch(request) - filtered_links = self.strategy.filter_extracted_links(request, links) - if filtered_links: - # modify last message with a new links list - batch[-1] = (type, request, filtered_links) - self.states_context.to_fetch(filtered_links) - else: - # drop last message if nothing to process - batch.pop() - self.stats['dropped_links_extracted'] += 1 - continue - if type == 'request_error': - _, request, error = msg - self.states_context.to_fetch(request) - continue - if type == 'offset': - continue - self.collect_unknown_message(msg) - except Exception as exc: - logger.exception(exc) - pass + self.workflow.collect(event) finally: consumed += 1 - return (batch, consumed) - - def process_batch(self, batch): - for msg in batch: - type = msg[0] - try: - if type == 'page_crawled': - _, response = msg - if b'jid' not in response.meta or response.meta[b'jid'] != self.job_id: - continue - self.on_page_crawled(response) - self.stats['consumed_page_crawled'] += 1 - continue - if type == 'links_extracted': - _, request, links = msg - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: - continue - self.on_links_extracted(request, links) - self.stats['consumed_links_extracted'] += 1 - continue - if type == 'request_error': - _, request, error = msg - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: - continue - self.on_request_error(request, error) - self.stats['consumed_request_error'] += 1 - continue - self.on_unknown_message(msg) - except Exception as exc: - logger.exception(exc) - pass - - def work(self): - batch, consumed = self.collect_batch() - self.states_context.fetch() - self.process_batch(batch) - self.update_score.flush() - self.states_context.release() + self.workflow.process() # Exiting, if crawl is finished if self.strategy.finished(): @@ -288,26 +314,6 @@ def _perform_shutdown(self, _=None): except: logger.exception('Error on shutdown') - def on_page_crawled(self, response): - logger.debug("Page crawled %s", response.url) - self.states.set_states([response]) - self.strategy.page_crawled(response) - self.states.update_cache(response) - - def on_links_extracted(self, request, links): - logger.debug("Links extracted %s (%d)", request.url, len(links)) - for link in links: - logger.debug("URL: %s", link.url) - self.states.set_states(links) - self.strategy.links_extracted(request, links) - self.states.update_cache(links) - - def on_request_error(self, request, error): - logger.debug("Page error %s (%s)", request.url, error) - self.states.set_states(request) - self.strategy.page_error(request, error) - self.states.update_cache(request) - def set_process_info(self, process_info): self.process_info = process_info From 4a272125dbf7998e9d60065eca1404da846833cd Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 23 May 2018 11:02:32 +0200 Subject: [PATCH 173/273] more work --- frontera/worker/strategy.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git 
a/frontera/worker/strategy.py b/frontera/worker/strategy.py index de13bb5ad..c352f11d7 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -68,9 +68,9 @@ def process(self): self._on_request_error(request, error) self.stats['consumed_request_error'] += 1 continue - self._on_unknown_message(event) - except Exception as exc: - logger.exception(exc) + self.on_unknown_event(event) + except: + logger.exception("Exception during processing") pass self.scoring_stream.flush() self.states_context.release() @@ -102,15 +102,15 @@ def collect(self, event): return if typ == 'offset': return - self._collect_unknown_event(event) + self.collect_unknown_event(event) except: logger.exception("Error during event collection") pass - def _collect_unknown_event(self, msg): - logger.debug('Unknown message %s', msg) + def collect_unknown_event(self, event): + logger.debug('Unknown message %s', event) - def _on_unknown_message(self, msg): + def on_unknown_event(self, event): pass def _on_page_crawled(self, response): From c6c159b23e18631a4dc141d0f9b0a80e4b8d9a6c Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 23 May 2018 19:56:57 +0200 Subject: [PATCH 174/273] FrontierManager refactoring, more to come --- frontera/core/manager.py | 256 +++++++++++++++++++++++++++--------- frontera/utils/managers.py | 4 +- frontera/worker/strategy.py | 32 +++-- 3 files changed, 210 insertions(+), 82 deletions(-) diff --git a/frontera/core/manager.py b/frontera/core/manager.py index 06c98fd0c..77d96c106 100644 --- a/frontera/core/manager.py +++ b/frontera/core/manager.py @@ -13,9 +13,67 @@ from frontera.utils.misc import load_object -class ComponentsPipelineMixin(object): - def __init__(self, backend, strategy_class, strategy_args, middlewares=None, canonicalsolver=None, db_worker=False, - strategy_worker=False): +class BackendMixin(object): + def __init__(self, backend, db_worker=False, strategy_worker=False): + # Load backend + self._logger_components.debug("Loading backend '%s'", backend) + self._backend = self._load_backend(backend, db_worker, strategy_worker) + self._backend.frontier_start() + + def _load_backend(self, backend, db_worker, strategy_worker): + # FIXME remove obsolete + cls = load_object(backend) + assert issubclass(cls, Backend), "backend '%s' must subclass Backend" % cls.__name__ + if issubclass(cls, DistributedBackend): + if db_worker: + return cls.db_worker(self) + if strategy_worker: + return cls.strategy_worker(self) + return cls.local(self) + else: + assert not strategy_worker, "In order to distribute backend only DistributedBackend " \ + "subclasses are allowed to use" + if hasattr(cls, 'from_manager'): + return cls.from_manager(self) + else: + return cls() + + @property + def backend(self): + """ + The :class:`Backend ` object to be used by the frontier. \ + Can be defined with :setting:`BACKEND` setting. 
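
With _load_backend() above, a distributed backend is expected to provide three alternative constructors, mirroring the HBase changes earlier in this series. A skeletal example (hypothetical class; the remainder of the Backend interface is omitted):

    from frontera.core.components import DistributedBackend

    class ExampleBackend(DistributedBackend):
        def __init__(self, manager):
            self._queue = self._states = self._metadata = None

        @classmethod
        def strategy_worker(cls, manager):
            o = cls(manager)
            # initialize the states storage only
            return o

        @classmethod
        def db_worker(cls, manager):
            o = cls(manager)
            # initialize queue and metadata storages
            return o

        @classmethod
        def local(cls, manager):
            o = cls(manager)
            # single-process mode: queue and states together
            return o
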
+ """ + return self._backend + + def close(self): + self.backend.frontier_stop() + + +class StrategyMixin(object): + def __init__(self, strategy_class, strategy_args, scoring_stream): + self._scoring_stream = scoring_stream if scoring_stream else LocalUpdateScoreStream(self.backend.queue) + self._states_context = StatesContext(self.backend.states) + if isinstance(strategy_class, str): + strategy_class = load_object(strategy_class) + self._strategy = strategy_class.from_worker(self, strategy_args, self._scoring_stream, self._states_context) + + @property + def strategy(self): + return self._strategy + + @property + def states_context(self): + return self._states_context + + def close(self): + self.strategy.close() + self.states_context.flush() + + +class ComponentsPipelineMixin(BackendMixin, StrategyMixin): + def __init__(self, backend, strategy_class, strategy_args, scoring_stream, middlewares=None, canonicalsolver=None, + db_worker=False, strategy_worker=False): self._logger_components = logging.getLogger("manager.components") # Load middlewares @@ -26,18 +84,10 @@ def __init__(self, backend, strategy_class, strategy_args, middlewares=None, can self._canonicalsolver = self._load_object(canonicalsolver) assert isinstance(self.canonicalsolver, CanonicalSolver), \ "canonical solver '%s' must subclass CanonicalSolver" % self.canonicalsolver.__class__.__name__ - - # Load backend - self._logger_components.debug("Loading backend '%s'", backend) - self._backend = self._load_backend(backend, db_worker, strategy_worker) - self._backend.frontier_start() - - # Instantiate strategy - self._scoring_stream = LocalUpdateScoreStream(self.backend.queue) - self._states_context = StatesContext(self.backend.states) - if isinstance(strategy_class, str): - strategy_class = load_object(strategy_class) - self._strategy = strategy_class.from_worker(self, strategy_args, self._scoring_stream, self._states_context) + BackendMixin.__init__(self, backend, db_worker, strategy_worker) + if not db_worker: + # TODO Distributed Scrapy case + StrategyMixin.__init__(self, strategy_class, strategy_args, scoring_stream) @property def canonicalsolver(self): @@ -54,36 +104,6 @@ def middlewares(self): """ return self._middlewares - @property - def backend(self): - """ - The :class:`Backend ` object to be used by the frontier. \ - Can be defined with :setting:`BACKEND` setting. 
- """ - return self._backend - - @property - def strategy(self): - return self._strategy - - def _load_backend(self, backend, db_worker, strategy_worker): - # FIXME remove obsolete - cls = load_object(backend) - assert issubclass(cls, Backend), "backend '%s' must subclass Backend" % cls.__name__ - if issubclass(cls, DistributedBackend): - if db_worker: - return cls.db_worker(self) - if strategy_worker: - return cls.strategy_worker(self) - return cls.local(self) - else: - assert not strategy_worker, "In order to distribute backend only DistributedBackend " \ - "subclasses are allowed to use" - if hasattr(cls, 'from_manager'): - return cls.from_manager(self) - else: - return cls() - def _load_middlewares(self, middleware_names): # TO-DO: Use dict for middleware ordering mws = [] @@ -131,9 +151,8 @@ def _process_component(self, component, method_name, component_category, obj, re return return_obj def close(self): - self.strategy.close() - self._states_context.flush() - self.backend.frontier_stop() + StrategyMixin.close(self) + BackendMixin.close(self) class BaseManager(object): @@ -204,7 +223,7 @@ def settings(self): return self._settings -class FrontierManager(BaseManager, ComponentsPipelineMixin): +class LocalFrontierManager(BaseManager, ComponentsPipelineMixin): """ The :class:`FrontierManager ` object encapsulates the whole frontier, providing an API to interact with. It's also responsible of loading and communicating all different frontier @@ -297,7 +316,7 @@ def from_settings(cls, settings=None, db_worker=False, strategy_worker=False): :ref:`frontier default settings ` are used. """ manager_settings = Settings.object_from(settings) - return FrontierManager(request_model=manager_settings.REQUEST_MODEL, + return LocalFrontierManager(request_model=manager_settings.REQUEST_MODEL, response_model=manager_settings.RESPONSE_MODEL, backend=manager_settings.BACKEND, strategy_class=manager_settings.STRATEGY, @@ -473,11 +492,11 @@ def page_crawled(self, response): (self.response_model.__name__, type(response).__name__) self._states_context.to_fetch(response) self._states_context.fetch() - self._states_context._states.set_states(response) + self._states_context.states.set_states(response) self._process_components(method_name='page_crawled', obj=response, return_classes=self.response_model) - self._states_context._states.update_cache(response) + self._states_context.states.update_cache(response) def links_extracted(self, request, links): """ @@ -507,13 +526,13 @@ def links_extracted(self, request, links): self._states_context.to_fetch(request) self._states_context.to_fetch(filtered) self._states_context.fetch() - self._states_context._states.set_states(filtered) + self._states_context.states.set_states(filtered) self._process_components(method_name='links_extracted', obj=request, return_classes=self.request_model, components=(2,), links=filtered) - self._states_context._states.update_cache(filtered) + self._states_context.states.update_cache(filtered) def request_error(self, request, error): """ @@ -528,12 +547,12 @@ def request_error(self, request, error): self._logger.debug('PAGE_REQUEST_ERROR url=%s error=%s', request.url, error) self._states_context.to_fetch(request) self._states_context.fetch() - self._states_context._states.set_states(request) + self._states_context.states.set_states(request) processed_page = self._process_components(method_name='request_error', obj=request, return_classes=self.request_model, error=error) - self._states_context._states.update_cache(request) + 
self._states_context.states.update_cache(request) return processed_page def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): @@ -554,13 +573,124 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No return_classes=self.request_model, components=(0,1)) - - def _check_startstop(self): assert self._started, "Frontier not started!" assert not self._stopped, "Call to stopped frontier!" +class WorkerFrontierManager(BaseManager, ComponentsPipelineMixin): + """ + The :class:`WorkerFrontierManager ` class's role is to + instantiate the core components and is used mainly by workers. + """ + def __init__(self, settings, request_model, response_model, backend, strategy_class, strategy_args, + max_next_requests, scoring_stream, middlewares=None, canonicalsolver=None, db_worker=False, + strategy_worker=False): + """ + :param object/string request_model: The :class:`Request ` object to be \ + used by the frontier. + + :param object/string response_model: The :class:`Response ` object to be \ + used by the frontier. + + :param object/string backend: The :class:`Backend ` object to be \ + used by the frontier. + + :param list middlewares: A list of :class:`Middleware ` \ + objects to be used by the frontier. + + :param int max_next_requests: Maximum number of requests returned by \ + :attr:`get_next_requests ` method. + + :param object/string settings: The :class:`Settings ` object used by \ + the frontier. + + :param object/string canonicalsolver: The :class:`CanonicalSolver ` + object to be used by frontier. + :param object scoring_stream: Instance of :class:`UpdateScoreStream ` + for crawling strategy to send scheduled requests to. + + :param bool db_worker: True if class is instantiated in DB worker environment + + :param bool strategy_worker: True if class is instantiated in strategy worker environment + """ + + BaseManager.__init__(self, request_model, response_model, settings=settings) + + self._max_next_requests = max_next_requests + + ComponentsPipelineMixin.__init__(self, backend=backend, strategy_class=strategy_class, + strategy_args=strategy_args, scoring_stream=scoring_stream, + middlewares=middlewares, canonicalsolver=canonicalsolver, + db_worker=db_worker, strategy_worker=strategy_worker) + + # Init frontier components pipeline + # Some code relies on the order, modify carefully + self._components_pipeline = [ + ('Middleware', self.middlewares, True), + ('CanonicalSolver', self.canonicalsolver, False), + ] + + # Log frontier manager start + self._logger.info('Frontier Manager Started!') + self._logger.info('-'*80) + + @classmethod + def from_settings(cls, settings=None, db_worker=False, strategy_worker=False, scoring_stream=None): + manager_settings = Settings.object_from(settings) + return WorkerFrontierManager(request_model=manager_settings.REQUEST_MODEL, + response_model=manager_settings.RESPONSE_MODEL, + backend=manager_settings.BACKEND, + strategy_class=manager_settings.STRATEGY, + strategy_args=manager_settings.STRATEGY_ARGS, + middlewares=manager_settings.MIDDLEWARES, + max_next_requests=manager_settings.MAX_NEXT_REQUESTS, + settings=manager_settings, + canonicalsolver=manager_settings.CANONICAL_SOLVER, + db_worker=db_worker, + strategy_worker=strategy_worker, + scoring_stream=scoring_stream) + + def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): + """ + Creates request and applies middleware and canonical solver pipelines.
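
Workers construct their manager through this entry point; the strategy worker changes below do exactly that. In essence (a sketch; the URL is hypothetical):

    manager = WorkerFrontierManager.from_settings(settings, strategy_worker=True)
    request = manager.create_request('http://example.com/')  # runs middlewares and canonical solver only
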
+ + :param url: str + :param method: bytes + :param headers: dict + :param cookies: dict + :param meta: dict + :param body: bytes + :return: :class:`Request ` object + """ + r = self.request_model(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body) + return self._process_components('create_request', + obj=r, + return_classes=self.request_model, + components=(0, 1)) + + +class SpiderFrontierManager(LocalFrontierManager): + def __init__(self, *args, **kwargs): + super(SpiderFrontierManager, self).__init__(*args, **kwargs) + + @classmethod + def from_settings(cls, settings=None, db_worker=False, strategy_worker=False, scoring_stream=None): + manager_settings = Settings.object_from(settings) + return SpiderFrontierManager(request_model=manager_settings.REQUEST_MODEL, + response_model=manager_settings.RESPONSE_MODEL, + backend=manager_settings.BACKEND, + strategy_class=manager_settings.STRATEGY, + strategy_args=manager_settings.STRATEGY_ARGS, + middlewares=manager_settings.MIDDLEWARES, + max_next_requests=manager_settings.MAX_NEXT_REQUESTS, + settings=manager_settings, + canonicalsolver=manager_settings.CANONICAL_SOLVER, + db_worker=db_worker, + strategy_worker=strategy_worker, + scoring_stream=scoring_stream) + + @six.add_metaclass(ABCMeta) class UpdateScoreStream(object): @@ -598,7 +728,7 @@ class StatesContext(object): def __init__(self, states): self._requests = [] - self._states = states + self.states = states self._fingerprints = dict() self.logger = logging.getLogger("states-context") @@ -609,20 +739,20 @@ def to_fetch(self, requests): self._fingerprints[fingerprint] = request def fetch(self): - self._states.fetch(self._fingerprints) + self.states.fetch(self._fingerprints) self._fingerprints.clear() def refresh_and_keep(self, requests): self.to_fetch(requests) self.fetch() - self._states.set_states(requests) + self.states.set_states(requests) self._requests.extend(requests if isinstance(requests, Iterable) else [requests]) def release(self): - self._states.update_cache(self._requests) + self.states.update_cache(self._requests) self._requests = [] def flush(self): self.logger.info("Flushing states") - self._states.flush() + self.states.flush() self.logger.info("Flushing of states finished") \ No newline at end of file diff --git a/frontera/utils/managers.py b/frontera/utils/managers.py index 2f4c8b638..61561fe2d 100644 --- a/frontera/utils/managers.py +++ b/frontera/utils/managers.py @@ -1,11 +1,11 @@ from __future__ import absolute_import -from frontera.core.manager import FrontierManager +from frontera.core.manager import LocalFrontierManager from .converters import BaseRequestConverter, BaseResponseConverter class FrontierManagerWrapper(object): def __init__(self, settings, manager=None): - manager = manager or FrontierManager + manager = manager or LocalFrontierManager self.manager = manager.from_settings(settings) self.request_converter = None self.response_converter = None diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index c352f11d7..1274a468a 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -18,7 +18,7 @@ from twisted.internet.defer import Deferred from twisted.internet.task import LoopingCall -from frontera.core.manager import FrontierManager, MessageBusUpdateScoreStream, StatesContext +from frontera.core.manager import WorkerFrontierManager, MessageBusUpdateScoreStream, StatesContext from frontera.logger.handlers import CONSOLE from frontera.settings import Settings from frontera.utils.misc import 
load_object @@ -30,9 +30,9 @@ class BatchedWorkflow(object): - def __init__(self, strategy, states_context, scoring_stream, stats, job_id): - self.strategy = strategy - self.states_context = states_context + def __init__(self, manager, scoring_stream, stats, job_id): + self.strategy = manager.strategy + self.states_context = manager.states_context self.scoring_stream = scoring_stream self.stats = stats self.job_id = job_id @@ -137,7 +137,7 @@ def _on_request_error(self, request, error): class BaseStrategyWorker(object): """Base strategy worker class.""" - def __init__(self, settings, strategy_class, strategy_args, is_add_seeds_mode): + def __init__(self, settings, is_add_seeds_mode): partition_id = settings.get('SCORING_PARTITION_ID') if partition_id is None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") @@ -152,20 +152,17 @@ def __init__(self, settings, strategy_class, strategy_args, is_add_seeds_mode): self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') self.scoring_log_producer = scoring_log.producer() - self._manager = FrontierManager.from_settings(settings, strategy_worker=True) + manager = WorkerFrontierManager.from_settings(settings, strategy_worker=True) codec_path = settings.get('MESSAGE_BUS_CODEC') encoder_cls = load_object(codec_path+".Encoder") decoder_cls = load_object(codec_path+".Decoder") - self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model) - self._encoder = encoder_cls(self._manager.request_model) + self._decoder = decoder_cls(manager.request_model, manager.response_model) + self._encoder = encoder_cls(manager.request_model) self.update_score = MessageBusUpdateScoreStream(self.scoring_log_producer, self._encoder) - self.states_context = StatesContext(self._manager.backend.states) self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') - self.strategy = strategy_class.from_worker(self._manager, strategy_args, self.update_score, self.states_context) - self.states = self._manager.backend.states self.stats = defaultdict(int) - self.workflow = BatchedWorkflow(self.strategy, self.states_context, self.update_score, self.stats, 0) + self.workflow = BatchedWorkflow(manager, self.update_score, self.stats, 0) self.task = LoopingCall(self.work) self._logging_task = LoopingCall(self.log_status) self._flush_states_task = LoopingCall(self.flush_states) @@ -347,11 +344,11 @@ def setup_environment(): "supported, implies add seeds run mode") args = parser.parse_args() settings = Settings(module=args.config) - strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY') + strategy_classpath = args.strategy if args.strategy else settings.get('STRATEGY') if not strategy_classpath: raise ValueError("Couldn't locate strategy class path. 
Please supply it either using command line option or " "settings file.") - strategy_class = load_object(strategy_classpath) + settings.set('STRATEGY', strategy_classpath) partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID') if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0: @@ -364,6 +361,7 @@ def setup_environment(): for arg in args.args: key, _, value = arg.partition("=") strategy_args[key] = value if value else None + settings.set("STRATEGY_ARGS", strategy_args) logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and exists(logging_config_path): @@ -373,12 +371,12 @@ def setup_environment(): logger.setLevel(args.log_level) logger.addHandler(CONSOLE) - return settings, strategy_class, args.add_seeds, strategy_args, args.seeds_url + return settings, args.add_seeds, args.seeds_url if __name__ == '__main__': - settings, strategy_class, is_add_seeds_mode, strategy_args, seeds_url = setup_environment() - worker = StrategyWorker(settings, strategy_class, strategy_args, is_add_seeds_mode) + settings, is_add_seeds_mode, seeds_url = setup_environment() + worker = StrategyWorker(settings, is_add_seeds_mode) server = WorkerJsonRpcService(worker, settings) server.start_listening() worker.run(seeds_url) From fde25fcd6725350568dd8e801a412dfbebd63b78 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 25 May 2018 08:41:21 +0200 Subject: [PATCH 175/273] proper init hierarchy --- frontera/__init__.py | 1 - frontera/core/manager.py | 287 ++++++++++++++++++++---------------- frontera/worker/db.py | 5 +- frontera/worker/strategy.py | 2 +- 4 files changed, 164 insertions(+), 131 deletions(-) diff --git a/frontera/__init__.py b/frontera/__init__.py index 8f97ddc24..fe024fd87 100644 --- a/frontera/__init__.py +++ b/frontera/__init__.py @@ -1,5 +1,4 @@ from __future__ import absolute_import -from .core.manager import FrontierManager from .core.models import Request, Response from .core.components import Backend, DistributedBackend, Middleware from .settings import Settings diff --git a/frontera/core/manager.py b/frontera/core/manager.py index 77d96c106..2d45fdcce 100644 --- a/frontera/core/manager.py +++ b/frontera/core/manager.py @@ -71,9 +71,8 @@ def close(self): self.states_context.flush() -class ComponentsPipelineMixin(BackendMixin, StrategyMixin): - def __init__(self, backend, strategy_class, strategy_args, scoring_stream, middlewares=None, canonicalsolver=None, - db_worker=False, strategy_worker=False): +class ComponentsPipelineMixin(BackendMixin): + def __init__(self, backend, middlewares=None, canonicalsolver=None, db_worker=False, strategy_worker=False): self._logger_components = logging.getLogger("manager.components") # Load middlewares @@ -85,9 +84,6 @@ def __init__(self, backend, strategy_class, strategy_args, scoring_stream, middl assert isinstance(self.canonicalsolver, CanonicalSolver), \ "canonical solver '%s' must subclass CanonicalSolver" % self.canonicalsolver.__class__.__name__ BackendMixin.__init__(self, backend, db_worker, strategy_worker) - if not db_worker: - # TODO Distributed Scrapy case - StrategyMixin.__init__(self, strategy_class, strategy_args, scoring_stream) @property def canonicalsolver(self): @@ -151,11 +147,21 @@ def _process_component(self, component, method_name, component_category, obj, re return return_obj def close(self): - StrategyMixin.close(self) BackendMixin.close(self) + super(ComponentsPipelineMixin, self).close() -class BaseManager(object): +class 
StrategyComponentsPipelineMixin(ComponentsPipelineMixin, StrategyMixin): + def __init__(self, backend, strategy_class, strategy_args, scoring_stream, **kwargs): + super(StrategyComponentsPipelineMixin, self).__init__(backend, **kwargs) + StrategyMixin.__init__(self, strategy_class, strategy_args, scoring_stream) + + def close(self): + StrategyMixin.close(self) + super(StrategyComponentsPipelineMixin, self).close() + + +class BaseContext(object): def __init__(self, request_model, response_model, settings=None): # Settings @@ -181,7 +187,7 @@ def __init__(self, request_model, response_model, settings=None): @classmethod def from_settings(cls, settings=None): manager_settings = Settings(settings) - return BaseManager(request_model=manager_settings.REQUEST_MODEL, + return BaseContext(request_model=manager_settings.REQUEST_MODEL, response_model=manager_settings.RESPONSE_MODEL, settings=manager_settings) @@ -223,7 +229,76 @@ def settings(self): return self._settings -class LocalFrontierManager(BaseManager, ComponentsPipelineMixin): +class BaseManager(object): + def get_next_requests(self, max_next_requests=0, **kwargs): + """ + Returns a list of next requests to be crawled. Optionally a maximum number of pages can be passed. If no + value is passed, \ + :attr:`FrontierManager.max_next_requests ` + will be used instead. (:setting:`MAX_NEXT_REQUESTS` setting). + + :param int max_next_requests: Maximum number of requests to be returned by this method. + :param dict kwargs: Arbitrary arguments that will be passed to backend. + + :return: list of :class:`Request ` objects. + """ + + # log (in) + self._logger.debug('GET_NEXT_REQUESTS(in) max_next_requests=%s', max_next_requests) + + # get next requests + next_requests = self.backend.get_next_requests(max_next_requests, **kwargs) + + # log (out) + self._logger.debug('GET_NEXT_REQUESTS(out) returned_requests=%s', len(next_requests)) + return next_requests + + def page_crawled(self, response): + """ + Informs the frontier about the crawl result. + + :param object response: The :class:`Response ` object for the crawled page. + + :return: None. + """ + self._logger.debug('PAGE_CRAWLED url=%s status=%s', response.url, response.status_code) + self._process_components(method_name='page_crawled', + obj=response, + return_classes=self.response_model) + + def links_extracted(self, request, links): + """ + Informs the frontier about extracted links for the request. + + :param object request: The :class:`Request ` object from which the links where crawled. + :param list links: A list of :class:`Request ` objects generated from the links \ + extracted for the request. + + :return: None. + """ + self._logger.debug('LINKS_EXTRACTED url=%s links=%d', request.url, len(links)) + self._process_components(method_name='links_extracted', + obj=request, + return_classes=self.request_model, + components=(0, 1), + links=links) + + def links_extracted_after(self, request, filtered): + self._process_components(method_name='links_extracted', + obj=request, + return_classes=self.request_model, + components=(2,), + links=filtered) + + def request_error(self, request, error): + self._logger.debug('PAGE_REQUEST_ERROR url=%s error=%s', request.url, error) + return self._process_components(method_name='request_error', + obj=request, + return_classes=self.request_model, + error=error) + + +class LocalFrontierManager(BaseContext, StrategyComponentsPipelineMixin, BaseManager): """ The :class:`FrontierManager ` object encapsulates the whole frontier, providing an API to interact with. 
It's also responsible of loading and communicating all different frontier @@ -231,7 +306,7 @@ class LocalFrontierManager(BaseManager, ComponentsPipelineMixin): """ def __init__(self, request_model, response_model, backend, strategy_class, strategy_args, middlewares=None, test_mode=False, max_requests=0, max_next_requests=0, auto_start=True, settings=None, - canonicalsolver=None, db_worker=False, strategy_worker=False): + canonicalsolver=None): """ :param object/string request_model: The :class:`Request ` object to be \ used by the frontier. @@ -261,13 +336,9 @@ def __init__(self, request_model, response_model, backend, strategy_class, strat :param object/string canonicalsolver: The :class:`CanonicalSolver ` object to be used by frontier. - - :param bool db_worker: True if class is instantiated in DB worker environment - - :param bool strategy_worker: True if class is instantiated in strategy worker environment """ - BaseManager.__init__(self, request_model, response_model, settings=settings) + BaseContext.__init__(self, request_model, response_model, settings=settings) # Test mode self._test_mode = test_mode @@ -284,10 +355,9 @@ def __init__(self, request_model, response_model, backend, strategy_class, strat # Manager finished flag self._finished = False - ComponentsPipelineMixin.__init__(self, backend=backend, strategy_class=strategy_class, - strategy_args=strategy_args, middlewares=middlewares, - canonicalsolver=canonicalsolver, db_worker=db_worker, - strategy_worker=strategy_worker) + StrategyComponentsPipelineMixin.__init__(self, backend, strategy_class, strategy_args, None, + middlewares=middlewares, canonicalsolver=canonicalsolver, + db_worker=False, strategy_worker=False) # Init frontier components pipeline # Some code relies on the order, modify carefully @@ -317,19 +387,17 @@ def from_settings(cls, settings=None, db_worker=False, strategy_worker=False): """ manager_settings = Settings.object_from(settings) return LocalFrontierManager(request_model=manager_settings.REQUEST_MODEL, - response_model=manager_settings.RESPONSE_MODEL, - backend=manager_settings.BACKEND, - strategy_class=manager_settings.STRATEGY, - strategy_args=manager_settings.STRATEGY_ARGS, - middlewares=manager_settings.MIDDLEWARES, - test_mode=manager_settings.TEST_MODE, - max_requests=manager_settings.MAX_REQUESTS, - max_next_requests=manager_settings.MAX_NEXT_REQUESTS, - auto_start=manager_settings.AUTO_START, - settings=manager_settings, - canonicalsolver=manager_settings.CANONICAL_SOLVER, - db_worker=db_worker, - strategy_worker=strategy_worker) + response_model=manager_settings.RESPONSE_MODEL, + backend=manager_settings.BACKEND, + strategy_class=manager_settings.STRATEGY, + strategy_args=manager_settings.STRATEGY_ARGS, + middlewares=manager_settings.MIDDLEWARES, + test_mode=manager_settings.TEST_MODE, + max_requests=manager_settings.MAX_REQUESTS, + max_next_requests=manager_settings.MAX_NEXT_REQUESTS, + auto_start=manager_settings.AUTO_START, + settings=manager_settings, + canonicalsolver=manager_settings.CANONICAL_SOLVER) @property def test_mode(self): @@ -452,12 +520,8 @@ def get_next_requests(self, max_next_requests=0, **kwargs): if self.n_requests+max_next_requests > self.max_requests: max_next_requests = self.max_requests - self.n_requests - # log (in) - self._logger.debug('GET_NEXT_REQUESTS(in) max_next_requests=%s n_requests=%s/%s', - max_next_requests, self.n_requests, self.max_requests or '-') - # get next requests - next_requests = self.backend.get_next_requests(max_next_requests, **kwargs) + 
next_requests = super(LocalFrontierManager, self).get_next_requests(max_next_requests, **kwargs) # Increment requests counter self._n_requests += len(next_requests) @@ -466,21 +530,10 @@ def get_next_requests(self, max_next_requests=0, **kwargs): if next_requests: self._iteration += 1 - # log (out) - self._logger.debug('GET_NEXT_REQUESTS(out) returned_requests=%s n_requests=%s/%s', - len(next_requests), self.n_requests, self.max_requests or '-') return next_requests def page_crawled(self, response): - """ - Informs the frontier about the crawl result. - - :param object response: The :class:`Response ` object for the crawled page. - - :return: None. - """ self._check_startstop() - self._logger.debug('PAGE_CRAWLED url=%s status=%s', response.url, response.status_code) assert isinstance(response, self.response_model), "Response object must subclass '%s', '%s' found" % \ (self.response_model.__name__, type(response).__name__) assert hasattr(response, 'request') and response.request, "Empty response request" @@ -490,49 +543,28 @@ def page_crawled(self, response): type(response.request).__name__) assert isinstance(response, self.response_model), "Response object must subclass '%s', '%s' found" % \ (self.response_model.__name__, type(response).__name__) - self._states_context.to_fetch(response) - self._states_context.fetch() - self._states_context.states.set_states(response) - self._process_components(method_name='page_crawled', - obj=response, - return_classes=self.response_model) - self._states_context.states.update_cache(response) + self.states_context.to_fetch(response) + self.states_context.fetch() + self.states_context.states.set_states(response) + super(LocalFrontierManager, self).page_crawled(response) + self.states_context.states.update_cache(response) def links_extracted(self, request, links): - """ - Informs the frontier about extracted links for the request. - - :param object request: The :class:`Request ` object from which the links where crawled. - :param list links: A list of :class:`Request ` objects generated from the links \ - extracted for the request. - - :return: None. 
- """ self._check_startstop() - self._logger.debug('LINKS_EXTRACTED url=%s links=%d', request.url, len(links)) assert isinstance(request, self.request_model), "Request object must subclass '%s', '%s' found" % \ (self.request_model.__name__, type(request).__name__) for link in links: assert isinstance(link, self._request_model), "Link objects must subclass '%s', '%s' found" % \ (self._request_model.__name__, type(link).__name__) - self._process_components(method_name='links_extracted', - obj=request, - return_classes=self.request_model, - components=(0, 1), - links=links) - + super(LocalFrontierManager, self).links_extracted(request, links) filtered = self.strategy.filter_extracted_links(request, links) if filtered: - self._states_context.to_fetch(request) - self._states_context.to_fetch(filtered) - self._states_context.fetch() - self._states_context.states.set_states(filtered) - self._process_components(method_name='links_extracted', - obj=request, - return_classes=self.request_model, - components=(2,), - links=filtered) - self._states_context.states.update_cache(filtered) + self.states_context.to_fetch(request) + self.states_context.to_fetch(filtered) + self.states_context.fetch() + self.states_context.states.set_states(filtered) + super(LocalFrontierManager, self).links_extracted_after(request, filtered) + self.states_context.states.update_cache(filtered) def request_error(self, request, error): """ @@ -544,15 +576,11 @@ def request_error(self, request, error): :return: None. """ self._check_startstop() - self._logger.debug('PAGE_REQUEST_ERROR url=%s error=%s', request.url, error) - self._states_context.to_fetch(request) - self._states_context.fetch() - self._states_context.states.set_states(request) - processed_page = self._process_components(method_name='request_error', - obj=request, - return_classes=self.request_model, - error=error) - self._states_context.states.update_cache(request) + self.states_context.to_fetch(request) + self.states_context.fetch() + self.states_context.states.set_states(request) + processed_page = super(LocalFrontierManager, self).request_error(request, error) + self.states_context.states.update_cache(request) return processed_page def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): @@ -569,16 +597,16 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No """ r = self.request_model(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body) return self._process_components('create_request', - obj=r, - return_classes=self.request_model, - components=(0,1)) + obj=r, + return_classes=self.request_model, + components=(0,1)) def _check_startstop(self): assert self._started, "Frontier not started!" assert not self._stopped, "Call to stopped frontier!" -class WorkerFrontierManager(BaseManager, ComponentsPipelineMixin): +class WorkerFrontierManager(BaseContext, StrategyComponentsPipelineMixin): """ The :class:`WorkerFrontierManager ` class role is to instantiate the core components and is used mainly by workers. 
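A pattern worth noting in the LocalFrontierManager overrides above (page_crawled, links_extracted, request_error): each performs its states_context bookkeeping and forwards the core call with super(), which resolves through the MRO to the BaseManager pipeline implementations. Below is a minimal sketch of that cooperative-inheritance shape; the class and method names are illustrative stand-ins, not Frontera's real API.

class PipelineBase(object):                       # stand-in for BaseManager
    def page_crawled(self, response):
        print('pipeline: page_crawled %s' % response)


class StatesBookkeeping(object):                  # stand-in for the states_context handling
    def set_state(self, response):
        print('states: set %s' % response)

    def update_cache(self, response):
        print('states: cached %s' % response)


class Manager(StatesBookkeeping, PipelineBase):   # mirrors LocalFrontierManager's base order
    def page_crawled(self, response):
        self.set_state(response)                      # set states first
        super(Manager, self).page_crawled(response)   # the MRO resolves this to PipelineBase.page_crawled
        self.update_cache(response)                   # then refresh the state cache


Manager().page_crawled('http://example.com/')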
@@ -615,14 +643,13 @@ def __init__(self, settings, request_model, response_model, backend, strategy_cl :param bool strategy_worker: True if class is instantiated in strategy worker environment """ - BaseManager.__init__(self, request_model, response_model, settings=settings) + BaseContext.__init__(self, request_model, response_model, settings=settings) self._max_next_requests = max_next_requests - ComponentsPipelineMixin.__init__(self, backend=backend, strategy_class=strategy_class, - strategy_args=strategy_args, scoring_stream=scoring_stream, - middlewares=middlewares, canonicalsolver=canonicalsolver, - db_worker=db_worker,strategy_worker=strategy_worker) + StrategyComponentsPipelineMixin.__init__(self, backend, strategy_class, strategy_args, scoring_stream, + middlewares=middlewares, canonicalsolver=canonicalsolver, + db_worker=db_worker,strategy_worker=strategy_worker) # Init frontier components pipeline # Some code relies on the order, modify carefully @@ -639,17 +666,17 @@ def __init__(self, settings, request_model, response_model, backend, strategy_cl def from_settings(cls, settings=None, db_worker=False, strategy_worker=False, scoring_stream=None): manager_settings = Settings.object_from(settings) return WorkerFrontierManager(request_model=manager_settings.REQUEST_MODEL, - response_model=manager_settings.RESPONSE_MODEL, - backend=manager_settings.BACKEND, - strategy_class=manager_settings.STRATEGY, - strategy_args=manager_settings.STRATEGY_ARGS, - middlewares=manager_settings.MIDDLEWARES, - max_next_requests=manager_settings.MAX_NEXT_REQUESTS, - settings=manager_settings, - canonicalsolver=manager_settings.CANONICAL_SOLVER, - db_worker=db_worker, - strategy_worker=strategy_worker, - scoring_stream=scoring_stream) + response_model=manager_settings.RESPONSE_MODEL, + backend=manager_settings.BACKEND, + strategy_class=manager_settings.STRATEGY, + strategy_args=manager_settings.STRATEGY_ARGS, + middlewares=manager_settings.MIDDLEWARES, + max_next_requests=manager_settings.MAX_NEXT_REQUESTS, + settings=manager_settings, + canonicalsolver=manager_settings.CANONICAL_SOLVER, + db_worker=db_worker, + strategy_worker=strategy_worker, + scoring_stream=scoring_stream) def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): """ @@ -670,25 +697,33 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No components=(0, 1)) -class SpiderFrontierManager(LocalFrontierManager): - def __init__(self, *args, **kwargs): - super(SpiderFrontierManager, self).__init__(*args, **kwargs) +class SpiderFrontierManager(BaseContext, ComponentsPipelineMixin, BaseManager): + def __init__(self, request_model, response_model, backend, middlewares, max_next_requests, settings, + canonicalsolver): + BaseContext.__init__(self, request_model, response_model, settings=settings) + ComponentsPipelineMixin.__init__(self, backend, middlewares=middlewares, canonicalsolver=canonicalsolver, + db_worker=False, strategy_worker=False) + + self._components_pipeline = [ + ('Middleware', self.middlewares, True), + ('CanonicalSolver', self.canonicalsolver, False), + ('Backend', self.backend, False) + ] @classmethod - def from_settings(cls, settings=None, db_worker=False, strategy_worker=False, scoring_stream=None): + def from_settings(cls, settings=None): manager_settings = Settings.object_from(settings) return SpiderFrontierManager(request_model=manager_settings.REQUEST_MODEL, - response_model=manager_settings.RESPONSE_MODEL, - backend=manager_settings.BACKEND, - 
strategy_class=manager_settings.STRATEGY, - strategy_args=manager_settings.STRATEGY_ARGS, - middlewares=manager_settings.MIDDLEWARES, - max_next_requests=manager_settings.MAX_NEXT_REQUESTS, - settings=manager_settings, - canonicalsolver=manager_settings.CANONICAL_SOLVER, - db_worker=db_worker, - strategy_worker=strategy_worker, - scoring_stream=scoring_stream) + response_model=manager_settings.RESPONSE_MODEL, + backend=manager_settings.BACKEND, + middlewares=manager_settings.MIDDLEWARES, + max_next_requests=manager_settings.MAX_NEXT_REQUESTS, + settings=manager_settings, + canonicalsolver=manager_settings.CANONICAL_SOLVER) + + def links_extracted(self, request, links): + super(SpiderFrontierManager, self).links_extracted(request, links) + super(SpiderFrontierManager, self).links_extracted_after(request, links) @six.add_metaclass(ABCMeta) diff --git a/frontera/worker/db.py b/frontera/worker/db.py index 9ee988e3c..d8d3a347c 100644 --- a/frontera/worker/db.py +++ b/frontera/worker/db.py @@ -2,7 +2,6 @@ from __future__ import absolute_import import os -import time import logging import threading from traceback import format_stack @@ -19,7 +18,7 @@ from frontera.utils.misc import load_object from frontera.logger.handlers import CONSOLE from frontera.exceptions import NotConfigured -from frontera.core.manager import FrontierManager +from frontera.core.manager import WorkerFrontierManager from frontera.worker.server import WorkerJsonRpcService from frontera.utils.ossignal import install_shutdown_handlers from frontera.worker.stats import StatsExportMixin @@ -100,7 +99,7 @@ def __init__(self, settings, no_batches, no_incoming, no_scoring, **kwargs): messagebus = load_object(settings.get('MESSAGE_BUS')) self.message_bus = messagebus(settings) - self._manager = FrontierManager.from_settings(settings, db_worker=True) + self._manager = WorkerFrontierManager.from_settings(settings, db_worker=True) self.backend = self._manager.backend codec_path = settings.get('MESSAGE_BUS_CODEC') diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 1274a468a..88e8b718a 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -18,7 +18,7 @@ from twisted.internet.defer import Deferred from twisted.internet.task import LoopingCall -from frontera.core.manager import WorkerFrontierManager, MessageBusUpdateScoreStream, StatesContext +from frontera.core.manager import WorkerFrontierManager, MessageBusUpdateScoreStream from frontera.logger.handlers import CONSOLE from frontera.settings import Settings from frontera.utils.misc import load_object From ec4747578077d263e85a95f87f243b2ea77301cd Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 28 May 2018 10:24:22 +0200 Subject: [PATCH 176/273] import, frontier_manager and strategy fixes --- .gitignore | 1 + frontera/contrib/middlewares/domain.py | 4 +- frontera/core/components.py | 9 --- frontera/core/manager.py | 10 +-- frontera/settings/default_settings.py | 1 - tests/backends.py | 9 ++- .../backends/redis_backend/test_redis.py | 6 +- tests/mocks/components.py | 58 ++++++++++++--- tests/test_domain_mware.py | 1 - tests/test_frontier_manager.py | 71 ++++++++++++------- tests/test_strategy.py | 12 ++-- 11 files changed, 118 insertions(+), 64 deletions(-) diff --git a/.gitignore b/.gitignore index 287b0b569..391d32d6e 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ var/ *.egg-info/ .installed.cfg *.egg +MANIFEST # PyInstaller # Usually these files are written by a python script from a template diff --git 
a/frontera/contrib/middlewares/domain.py b/frontera/contrib/middlewares/domain.py index 7ea665a3a..8f81af184 100644 --- a/frontera/contrib/middlewares/domain.py +++ b/frontera/contrib/middlewares/domain.py @@ -94,9 +94,9 @@ def create_request(self, request): return self._add_domain(request) def _add_domain(self, obj): - obj.meta[b'domain'] = self._parse_domain_info(obj.url, self.manager.test_mode) + obj.meta[b'domain'] = self._parse_domain_info(obj.url) if b'redirect_urls' in obj.meta: - obj.meta[b'redirect_domains'] = [self._parse_domain_info(url, self.manager.test_mode) + obj.meta[b'redirect_domains'] = [self._parse_domain_info(url) for url in obj.meta[b'redirect_urls']] return obj diff --git a/frontera/core/components.py b/frontera/core/components.py index b9746b58e..76abf81cf 100644 --- a/frontera/core/components.py +++ b/frontera/core/components.py @@ -22,15 +22,6 @@ class Metadata(StartStopMixin): """Interface definition for a frontier metadata class. This class is responsible for storing documents metadata, including content and optimized for write-only data flow.""" - @abstractmethod - def add_seeds(self, seeds): - """ - This method is called when new seeds are added to the frontier. - - :param list seeds: A list of :class:`Request ` objects. - """ - pass - @abstractmethod def page_crawled(self, response): """ diff --git a/frontera/core/manager.py b/frontera/core/manager.py index 2d45fdcce..389f69986 100644 --- a/frontera/core/manager.py +++ b/frontera/core/manager.py @@ -478,6 +478,7 @@ def stop(self): self._check_startstop() self._logger.debug('STOP') self._process_components(method_name='frontier_stop') + StrategyComponentsPipelineMixin.close(self) self._stopped = True def add_seeds(self, seeds_file): @@ -596,10 +597,11 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No :return: :class:`Request ` object """ r = self.request_model(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body) - return self._process_components('create_request', - obj=r, - return_classes=self.request_model, - components=(0,1)) + self._process_components('create_request', + obj=r, + return_classes=self.request_model, + components=(0,1)) + return r def _check_startstop(self): assert self._started, "Frontier not started!" 
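The add_seeds(seeds_file) signature in the manager.py hunk above is the user-visible API change of this series: seeds now arrive as an open byte stream that the manager hands to the crawling strategy's read_seeds(), rather than as a list of Request objects. A minimal sketch of that flow follows; SketchStrategy is a hypothetical stand-in, and a real BaseCrawlingStrategy subclass would build requests via self.create_request() and queue them via self.schedule(), as the test strategies later in this series do.

from io import BytesIO


class SketchStrategy(object):                 # hypothetical stand-in for a crawling strategy
    def __init__(self):
        self.scheduled = []

    def read_seeds(self, stream):             # same shape as read_seeds in the strategies below
        for line in stream:                   # the manager passes an open binary stream
            url = line.strip().decode('utf8')
            if url:
                self.scheduled.append(url)    # a real strategy would schedule a request here


stream = BytesIO(b'http://example.com/\nhttp://example.org/page\n')
strategy = SketchStrategy()
strategy.read_seeds(stream)
print(strategy.scheduled)                     # ['http://example.com/', 'http://example.org/page']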
diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 28fbe8cee..6427573e7 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -67,7 +67,6 @@ STATE_CACHE_SIZE = 1000000 STATE_CACHE_SIZE_LIMIT = 0 STORE_CONTENT = False -STRATEGY = "frontera.worker.strategies.bfs.CrawlingStrategy" STRATEGY_ARGS = {} SW_FLUSH_INTERVAL = 300 TEST_MODE = False diff --git a/tests/backends.py b/tests/backends.py index f3cdab956..ace58ce4e 100644 --- a/tests/backends.py +++ b/tests/backends.py @@ -1,7 +1,9 @@ from __future__ import absolute_import import pytest -from frontera import FrontierManager, Settings, FrontierTester +from frontera.core.manager import LocalFrontierManager +from frontera.worker.strategies.bfs import CrawlingStrategy +from frontera import Settings, FrontierTester from frontera.utils import graphs from frontera.utils.tester import BaseDownloaderSimulator @@ -38,14 +40,15 @@ def get_frontier(self): """ Returns frontierManager object """ - return FrontierManager.from_settings(self.get_settings()) + return LocalFrontierManager.from_settings(self.get_settings()) def get_settings(self): """ Returns backend settings """ return Settings(attributes={ - 'BACKEND': self.backend_class + 'BACKEND': self.backend_class, + 'STRATEGY': 'frontera.worker.strategies.bfs.CrawlingStrategy' }) diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py index adbc6ceb4..95bdbc73b 100644 --- a/tests/contrib/backends/redis_backend/test_redis.py +++ b/tests/contrib/backends/redis_backend/test_redis.py @@ -3,7 +3,7 @@ from frontera.contrib.backends.redis_backend import FIELD_DOMAIN_FINGERPRINT, FIELD_ERROR, FIELD_STATE from frontera.contrib.backends.redis_backend import FIELD_STATUS_CODE, FIELD_URL from frontera.contrib.backends.redis_backend import RedisBackend, RedisMetadata, RedisQueue, RedisState -from frontera.core.manager import FrontierManager +from frontera.core.manager import BaseContext from frontera.settings import Settings from redis import ConnectionPool, StrictRedis from time import time @@ -39,7 +39,7 @@ class RedisQueueTest(TestCase): @staticmethod def setup_subject(partitions): settings = Settings(module='frontera.settings.default_settings') - return RedisQueue(FrontierManager.from_settings(settings), get_pool(), partitions, True) + return RedisQueue(BaseContext.from_settings(settings), get_pool(), partitions, True) def test_scheduling_past_1part_5(self): subject = self.setup_subject(1) @@ -283,7 +283,7 @@ def test_get_next_requests_min_hosts_high_number(self): self.assertTrue('https://www.khellan.com/' in urls) self.assertEqual(1, subject.count()) - def test_get_next_requests_max_requests(self): + def test_get_next_requests_max_requests_2(self): subject = self.setup_subject(2) batch = [ ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), diff --git a/tests/mocks/components.py b/tests/mocks/components.py index 801257f10..aa6c76b69 100644 --- a/tests/mocks/components.py +++ b/tests/mocks/components.py @@ -1,18 +1,21 @@ from __future__ import absolute_import from frontera.core.components import Backend, Middleware, CanonicalSolver, \ DistributedBackend, Queue +from frontera.contrib.backends.memory import MemoryStates from six.moves import range from frontera.core.models import Request +from frontera.worker.strategies import BaseCrawlingStrategy +from frontera.core.components import States class 
FakeMiddleware(Middleware): def __init__(self): - self.seeds = [] + self.requests = [] self.responses = [] self.links = [] self.errors = [] - self.lists = [self.seeds, self.responses, self.links, self.errors] + self.lists = [self.requests, self.responses, self.links, self.errors] self._started = False self._stopped = False self.test_value = 'test' @@ -27,10 +30,9 @@ def frontier_start(self): def frontier_stop(self): self._stopped = True - def add_seeds(self, seeds): - for seed in seeds: - self.seeds.append(seed) - return seeds + def create_request(self, request): + self.requests.append(request) + return request def page_crawled(self, response): self.responses.append(response) @@ -73,8 +75,19 @@ def schedule(self, batch): class FakeBackend(FakeMiddleware, Backend): - _finished = False - queue = FakeQueue() + def __init__(self): + self._finished = False + self._queue = FakeQueue() + self._states = MemoryStates(10000) + super(FakeBackend, self).__init__() + + @property + def queue(self): + return self._queue + + @property + def states(self): + return self._states def finished(self): return self._finished @@ -127,6 +140,9 @@ def links_extracted(self, request, links): def request_error(self, request, error): self.errors.append((request, error)) + def create_request(self, request): + self.requests.append(request) + class FakeMiddlewareModifySeeds(FakeMiddleware): @@ -179,3 +195,29 @@ def links_extracted(self, request, links): self.links.append(link) link.meta[b'test_links_canonical_solver'] = self.test_value return request + + +class CrawlingStrategy(BaseCrawlingStrategy): + def read_seeds(self, fh): + for url in fh: + url = url.strip() + req = self.create_request(url) + self.refresh_states(req) + if req.meta[b'state'] is States.NOT_CRAWLED: + req.meta[b'state'] = States.QUEUED + self.schedule(req) + + def page_crawled(self, response): + response.meta[b'state'] = States.CRAWLED + + def filter_extracted_links(self, request, links): + return links + + def links_extracted(self, request, links): + for link in links: + if link.meta[b'state'] is States.NOT_CRAWLED: + link.meta[b'state'] = States.QUEUED + self.schedule(link, 0.5) + + def page_error(self, request, error): + request.meta[b'state'] = States.ERROR \ No newline at end of file diff --git a/tests/test_domain_mware.py b/tests/test_domain_mware.py index ecf06169f..a18462b4d 100644 --- a/tests/test_domain_mware.py +++ b/tests/test_domain_mware.py @@ -1,7 +1,6 @@ from __future__ import absolute_import import unittest from frontera.contrib.middlewares.domain import DomainMiddleware -from frontera.core.manager import FrontierManager from frontera.core.models import Request diff --git a/tests/test_frontier_manager.py b/tests/test_frontier_manager.py index 806154587..233fc842d 100644 --- a/tests/test_frontier_manager.py +++ b/tests/test_frontier_manager.py @@ -1,32 +1,45 @@ from __future__ import absolute_import -from frontera.core.manager import FrontierManager +from frontera.core.manager import LocalFrontierManager from frontera.settings import Settings from frontera.core.models import Request, Response +from frontera.core.components import States from six.moves import range +from unittest import TestCase -r1 = Request('http://www.example.com', meta={b'fingerprint': b'8ece61d2d42e578e86d9f95ad063cf36eb8e774d'}) +r1 = Request('http://www.example.com', meta={b'fingerprint': b'89e6a0649e06d83370cdf2cbfb05f363934a8d0c'}) r2 = Request('https://www.example.com/some/page', meta={b'fingerprint': b'61aec35fac3a032b3be3a5d07eb9e0024bd89de1'}) -r3 = 
Request('http://example1.com', meta={b'fingerprint': b'0ac55362d7391707e121dace4d203a0dc4393afc'}) +r3 = Request('http://example1.com', meta={b'fingerprint': b'758293d800fc9672ae2c68bd083359b74ab9b6c2'}) +seeds_blob = b"""http://www.example.com +https://www.example.com/some/page +http://example1.com +""" +from io import BytesIO -class TestFrontierManager(object): +SEEDS_FILE = BytesIO(seeds_blob) + + +class TestFrontierManager(TestCase): def setup_frontier_manager(self, settings=None): settings = settings or Settings() settings.BACKEND = 'tests.mocks.components.FakeBackend' - settings.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware', + settings.MIDDLEWARES = ['frontera.contrib.middlewares.domain.DomainMiddleware', + 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware', + 'tests.mocks.components.FakeMiddleware', 'tests.mocks.components.FakeMiddlewareModifySeeds', 'tests.mocks.components.FakeMiddlewareModifyResponse', 'tests.mocks.components.FakeMiddlewareModifyLinks'] settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver' - return FrontierManager.from_settings(settings) + settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy' + return LocalFrontierManager.from_settings(settings) def test_start(self): fm = self.setup_frontier_manager() assert fm._started is True assert fm.backend._started is True - assert [mw._started for mw in fm.middlewares] == [True]*4 + assert [mw._started for mw in fm.middlewares[-4:]] == [True]*4 assert fm.canonicalsolver._started is True def test_stop(self): @@ -34,7 +47,7 @@ def test_stop(self): fm.stop() assert fm._stopped is True assert fm.backend._stopped is True - assert [mw._stopped for mw in fm.middlewares] == [True]*4 + assert [mw._stopped for mw in fm.middlewares[-4:]] == [True]*4 assert fm.canonicalsolver._stopped is True def test_properties(self): @@ -48,24 +61,24 @@ def test_properties(self): def test_add_seeds(self): fm = self.setup_frontier_manager() - fm.add_seeds([r1, r2, r3]) + SEEDS_FILE.seek(0) + fm.add_seeds(SEEDS_FILE) + + fprints_set = set([r.meta[b'fingerprint'] for r in [r1, r2, r3]]) #seeds reached backend. - assert set([seed for seed in fm.backend.seeds]) == set([r1, r2, r3]) + assert set([r.meta[b'fingerprint'] for r in fm.backend.queue.requests]) == fprints_set #seeds reached canonicalsolver - assert set([seed for seed in fm.canonicalsolver.seeds]) == set([r1, r2, r3]) + assert set([r.meta[b'fingerprint'] for r in fm.canonicalsolver.requests]) == fprints_set #seeds reached the 4 middlewares. - assert [set([seed for seed in mw.seeds]) for mw in fm.middlewares] == [set([r1, r2, r3])]*4 - #seeds were modified. 
- assert [seed.meta[b'test_seeds'] for seed in [r1, r2, r3]] == ['test']*3 - assert [seed.meta[b'test_seeds_canonical_solver'] for seed in [r1, r2, r3]] == ['test']*3 + assert [set([r.meta[b'fingerprint'] for r in mw.requests]) for mw in fm.middlewares[-4:]] == [fprints_set]*4 def test_page_crawled(self): fm = self.setup_frontier_manager() response = Response(r1.url, request=r1) fm.page_crawled(response) - assert fm.backend.responses.pop() == response - assert [mw.responses.pop() for mw in fm.middlewares] == [response]*4 + assert response.meta[b'state'] == States.CRAWLED + assert [mw.responses.pop() for mw in fm.middlewares[-4:]] == [response]*4 assert fm.canonicalsolver.responses.pop() == response assert response.meta[b'test_response'] == 'test' @@ -73,9 +86,9 @@ def test_links_extracted(self): fm = self.setup_frontier_manager() response = Response(r1.url, request=r1) fm.links_extracted(r1, links=[r2, r3]) - assert set([link for link in fm.backend.links]) == set([r2, r3]) + assert set([link.meta[b'fingerprint'] for link in fm.backend.queue.requests]) == set([r.meta[b'fingerprint'] for r in [r2, r3]]) assert set([link for link in fm.canonicalsolver.links]) == set([r2, r3]) - assert [set([link for link in mw.links]) for mw in fm.middlewares] == [set([r2, r3])]*4 + assert [set([link for link in mw.links]) for mw in fm.middlewares[-4:]] == [set([r2, r3])]*4 assert [link.meta[b'test_links'] for link in [r2, r3]] == ['test']*2 assert [link.meta[b'test_links_canonical_solver'] for link in [r2, r3]] == ['test']*2 @@ -89,8 +102,8 @@ def test_get_next_requests(self): def test_request_error(self): fm = self.setup_frontier_manager() fm.request_error(r1, 'error') - assert fm.backend.errors.pop() == (r1, 'error') - assert [mw.errors.pop() for mw in fm.middlewares] == [(r1, 'error')]*4 + assert r1.meta[b'state'] == States.ERROR + assert [mw.errors.pop() for mw in fm.middlewares[-4:]] == [(r1, 'error')]*4 assert fm.canonicalsolver.errors.pop() == (r1, 'error') def test_max_requests_reached(self): @@ -106,14 +119,18 @@ def test_max_requests_reached(self): def test_blocking_middleware(self): settings = Settings() settings.BACKEND = 'tests.mocks.components.FakeBackend' - settings.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware', + settings.MIDDLEWARES = ['frontera.contrib.middlewares.domain.DomainMiddleware', + 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware', + 'tests.mocks.components.FakeMiddleware', 'tests.mocks.components.FakeMiddlewareModifySeeds', 'tests.mocks.components.FakeMiddlewareBlocking', 'tests.mocks.components.FakeMiddlewareModifyResponse', 'tests.mocks.components.FakeMiddlewareModifyLinks'] settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver' - fm = FrontierManager.from_settings(settings) - fm.add_seeds([r1, r2, r3]) + settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy' + fm = LocalFrontierManager.from_settings(settings) + SEEDS_FILE.seek(0) + fm.add_seeds(SEEDS_FILE) response = Response(r1.url, request=r1) fm.page_crawled(response) fm.links_extracted(r1, links=[r2]) @@ -122,9 +139,9 @@ def test_blocking_middleware(self): #the seeds, responses, links and errors have not reached the backend. assert [len(list) for list in fm.backend.lists] == [0]*4 #the 3 seeds reach the first three middlewares. - assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3]*3 + assert [len(fm.middlewares[i].requests) for i in range(2, 5)] == [3]*3 #the error, response and link reached the first three middlewares. 
- assert [[len(list) for list in fm.middlewares[i].lists[1:]] for i in range(3)] == [[1]*3]*3 + assert [[len(list) for list in fm.middlewares[i].lists[1:]] for i in range(2, 5)] == [[1]*3]*3 #the values do not reach the bottom 2 middlewares and the canonical solver. - assert [[len(list) for list in fm.middlewares[i].lists] for i in range(3, 5)] == [[0]*4]*2 + assert [[len(list) for list in fm.middlewares[i].lists] for i in range(5, 7)] == [[0]*4]*2 assert [len(list) for list in fm.canonicalsolver.lists] == [0]*4 diff --git a/tests/test_strategy.py b/tests/test_strategy.py index f1b58b2f5..aa4f6201b 100644 --- a/tests/test_strategy.py +++ b/tests/test_strategy.py @@ -1,15 +1,14 @@ # -*- coding: utf-8 -*- from frontera.worker.strategies import BaseCrawlingStrategy -from frontera.worker.strategy import StatesContext from frontera.settings import Settings -from frontera.core.manager import FrontierManager, StatesContext +from frontera.core.manager import WorkerFrontierManager, StatesContext from frontera.contrib.backends.memory import MemoryStates from frontera.core.components import States class TestingCrawlingStrategy(BaseCrawlingStrategy): - def add_seeds(self, seeds): + def read_seeds(self, seeds_file): pass def page_crawled(self, response): @@ -37,11 +36,12 @@ class TestCrawlingStrategy(object): def strategy(self): settings = Settings() settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' - manager = FrontierManager.from_settings(settings, db_worker=False, strategy_worker=True) + settings.STRATEGY = 'tests.test_strategy.TestingCrawlingStrategy' + manager = WorkerFrontierManager.from_settings(settings, db_worker=False, strategy_worker=True) stream = MessageBusStream() states = MemoryStates(10) states_ctx = StatesContext(states) - return TestingCrawlingStrategy.from_worker(manager, None, stream, states_ctx) + return manager.strategy def test_create_request(self): s = self.strategy() @@ -50,7 +50,7 @@ def test_create_request(self): def test_states_refresh(self): s = self.strategy() - states = s._states_context._states + states = s._states_context.states url = "http://test.com/someurl" req1 = s.create_request(url) req1.meta[b'state'] = States.CRAWLED From d6c2e5d1f3204fed592563d18183801cde48eab1 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 28 May 2018 14:21:56 +0200 Subject: [PATCH 177/273] SW test case fixes --- frontera/worker/stats.py | 2 +- frontera/worker/strategy.py | 31 ++++++++++++++++++------------- tests/test_worker_strategy.py | 31 ++++++++++++++++++++----------- 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/frontera/worker/stats.py b/frontera/worker/stats.py index c99128ac9..e3d7c7a76 100644 --- a/frontera/worker/stats.py +++ b/frontera/worker/stats.py @@ -92,7 +92,7 @@ def get_stats(self): stats = {stats_key: self.stats[stats_key] for stats_key in self.stats if stats_key.split('_', 1)[0] in self.STATS_PREFIXES} - stats.update(self._manager.backend.get_stats() or {}) + stats.update(self.backend.get_stats() or {}) if not stats: return stats['_timestamp'] = utc_timestamp() diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 88e8b718a..96968f91c 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -36,6 +36,7 @@ def __init__(self, manager, scoring_stream, stats, job_id): self.scoring_stream = scoring_stream self.stats = stats self.job_id = job_id + self.manager = manager self._batch = [] @@ -152,16 +153,21 @@ def __init__(self, settings, is_add_seeds_mode): self.consumer_batch_size 
= settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') self.scoring_log_producer = scoring_log.producer() - manager = WorkerFrontierManager.from_settings(settings, strategy_worker=True) codec_path = settings.get('MESSAGE_BUS_CODEC') - encoder_cls = load_object(codec_path+".Encoder") - decoder_cls = load_object(codec_path+".Decoder") - self._decoder = decoder_cls(manager.request_model, manager.response_model) - self._encoder = encoder_cls(manager.request_model) + encoder_cls = load_object(codec_path + ".Encoder") + decoder_cls = load_object(codec_path + ".Decoder") + + request_model = load_object(settings.get('REQUEST_MODEL')) + response_model = load_object(settings.get('RESPONSE_MODEL')) + self._decoder = decoder_cls(request_model, response_model) + self._encoder = encoder_cls(request_model) self.update_score = MessageBusUpdateScoreStream(self.scoring_log_producer, self._encoder) + manager = WorkerFrontierManager.from_settings(settings, strategy_worker=True, scoring_stream=self.update_score) + self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') self.stats = defaultdict(int) + self.backend = manager.backend self.workflow = BatchedWorkflow(manager, self.update_score, self.stats, 0) self.task = LoopingCall(self.work) self._logging_task = LoopingCall(self.log_status) @@ -186,7 +192,7 @@ def work(self): self.workflow.process() # Exiting, if crawl is finished - if self.strategy.finished(): + if self.workflow.strategy.finished(): logger.info("Successfully reached the crawling goal.") logger.info("Finishing.") d = self.stop_tasks() @@ -198,8 +204,9 @@ def work(self): def add_seeds(self, seeds_url): logger.info("Seeds addition started from url %s", seeds_url) + strategy = self.workflow.strategy if not seeds_url: - self.strategy.read_seeds(None) + strategy.read_seeds(None) else: parsed = urlparse(seeds_url) if parsed.scheme == "s3": @@ -214,14 +221,14 @@ def add_seeds(self, seeds_url): fh = open(parsed.path, "rb") else: raise TypeError("Unsupported URL scheme") - self.strategy.read_seeds(fh) + strategy.read_seeds(fh) try: fh.close() except: logger.exception("Error during closing of seeds stream") pass self.update_score.flush() - self.states_context.release() + self.workflow.states_context.release() def run(self, seeds_url): def log_failure(failure): @@ -266,7 +273,7 @@ def log_status(self): logger.info("%s=%s", k, v) def flush_states(self): - self.states_context.flush() + self.workflow.states_context.flush() def _handle_shutdown(self, signum, _): def call_shutdown(): @@ -300,10 +307,8 @@ def _stop_reactor(self, _=None): def _perform_shutdown(self, _=None): try: self.flush_states() - logger.info("Closing crawling strategy.") - self.strategy.close() logger.info("Stopping frontier manager.") - self._manager.stop() + self.workflow.manager.close() logger.info("Closing message bus.") self.scoring_log_producer.close() if not self.add_seeds_mode: diff --git a/tests/test_worker_strategy.py b/tests/test_worker_strategy.py index 1a00e218e..b35a160f8 100644 --- a/tests/test_worker_strategy.py +++ b/tests/test_worker_strategy.py @@ -1,8 +1,8 @@ from frontera.worker.strategy import StrategyWorker -from frontera.worker.strategies.bfs import CrawlingStrategy from frontera.settings import Settings from frontera.core.models import Request, Response from frontera.core.components import States +from tests.mocks.components import CrawlingStrategy from unittest import TestCase from os import remove from os.path import exists @@ -23,8 +23,9 @@ def setUp(self): settings = Settings() settings.BACKEND = 
'frontera.contrib.backends.sqlalchemy.Distributed' settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' + settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy' settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 - self.sw = StrategyWorker(settings, CrawlingStrategy, None, None) + self.sw = StrategyWorker(settings, False) def tearDown(self): if exists("/tmp/test_urls.txt"): @@ -35,15 +36,17 @@ def sw_setup_filtered_links(self): settings = Settings() settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' + settings.STRATEGY = 'tests.test_worker_strategy.FilteredLinksCrawlingStrategy' settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 - return StrategyWorker(settings, FilteredLinksCrawlingStrategy, None, None) + return StrategyWorker(settings, False) def sw_setup_add_seeds(self): settings = Settings() settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 - return StrategyWorker(settings, CrawlingStrategy, None, True) + settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy' + return StrategyWorker(settings, True) def test_add_seeds(self): sw = self.sw_setup_add_seeds() @@ -70,11 +73,11 @@ def test_page_crawled(self): sw.work() # response should be skipped if it's jid doesn't match the strategy worker's assert sw.scoring_log_producer.messages == [] - sw.job_id = 1 + sw.workflow.job_id = 1 sw.consumer.put_messages([msg]) sw.work() r1c = r1.copy() - sw.states.set_states(r1c) + sw.workflow.states_context.states.set_states(r1c) assert r1c.meta[b'state'] == States.CRAWLED def test_links_extracted(self): @@ -86,8 +89,14 @@ def test_links_extracted(self): sw.work() r3.meta[b'state'] = States.QUEUED r4.meta[b'state'] = States.QUEUED - assert set(sw.scoring_log_producer.messages) == \ - set(sw._encoder.encode_update_score(r, sw.strategy.get_score(r.url), True) for r in [r3, r4]) + + # decoding messages from scoring log + fprints = set() + for msg in sw.scoring_log_producer.messages: + typ, req, score, is_schedule = sw._decoder.decode(msg) + fprints.add(req.meta[b'fingerprint']) + + assert fprints == set([r.meta[b'fingerprint'] for r in [r3, r4]]) def test_filter_links_extracted(self): sw = self.sw_setup_filtered_links() @@ -105,6 +114,6 @@ def test_request_error(self): msg = sw._encoder.encode_request_error(r4, 'error') sw.consumer.put_messages([msg]) sw.work() - r4.meta[b'state'] = States.ERROR - assert sw.scoring_log_producer.messages.pop() == \ - sw._encoder.encode_update_score(r4, 0.0, False) + sw.workflow.states_context.states.set_states(r4) + + assert r4.meta[b'state'] == States.ERROR \ No newline at end of file From 3affb123f3d46859984413ec268cf7116d887166 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 30 May 2018 16:25:43 +0200 Subject: [PATCH 178/273] tests fixed --- tests/test_frontera_scheduler.py | 45 +++++++++----------------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/tests/test_frontera_scheduler.py b/tests/test_frontera_scheduler.py index 3b6e1ed04..197e7f326 100644 --- a/tests/test_frontera_scheduler.py +++ b/tests/test_frontera_scheduler.py @@ -30,17 +30,6 @@ class TestFronteraScheduler(object): - def test_enqueue_requests(self): - crawler = FakeCrawler() - fs = FronteraScheduler(crawler, manager=FakeFrontierManager) - fs.open(Spider) - assert fs.enqueue_request(r1) is True - assert fs.enqueue_request(r2) is True - assert 
fs.enqueue_request(r3) is True - assert set(seed.url for seed in fs.frontier.manager.seeds) == set([r1.url, r2.url, r3.url]) - assert all([isinstance(seed, FRequest) for seed in fs.frontier.manager.seeds]) - assert fs.stats_manager.stats.get_value('frontera/seeds_count') == 3 - def test_redirect_disabled_enqueue_requests(self): settings = Settings() settings['REDIRECT_ENABLED'] = False @@ -49,11 +38,7 @@ def test_redirect_disabled_enqueue_requests(self): fs.open(Spider) assert fs.enqueue_request(rr1) is False assert fs.enqueue_request(rr2) is False - assert fs.enqueue_request(rr3) is True - assert isinstance(fs.frontier.manager.seeds[0], FRequest) - assert len(fs.frontier.manager.seeds) == 1 - assert fs.frontier.manager.seeds[0].url == rr3.url - assert fs.stats_manager.stats.get_value('frontera/seeds_count') == 1 + assert fs.enqueue_request(rr3) is False def test_redirect_enabled_enqueue_requests(self): settings = Settings() @@ -64,13 +49,7 @@ def test_redirect_enabled_enqueue_requests(self): assert fs.enqueue_request(rr1) is True assert fs.enqueue_request(rr2) is True assert fs.enqueue_request(rr3) is True - assert len(fs.frontier.manager.seeds) == 1 - assert isinstance(fs.frontier.manager.seeds[0], FRequest) - assert fs.frontier.manager.seeds[0].url == rr3.url - assert set([request.url for request in fs._pending_requests]) == set([rr1.url, rr2.url]) - assert all([isinstance(request, Request) for request in fs._pending_requests]) - assert fs.stats_manager.stats.get_value('frontera/seeds_count') == 1 - assert fs.stats_manager.stats.get_value('frontera/redirected_requests_count') == 2 + assert set([request.url for request in fs._pending_requests]) == set([rr1.url, rr2.url, rr3.url]) def test_next_request(self): crawler = FakeCrawler() @@ -113,19 +92,19 @@ def test_next_request_overused_keys_info(self): def test_process_spider_output(self): i1 = {'name': 'item', 'item': 'i1'} i2 = {'name': 'item', 'item': 'i2'} - no_requests = 3 - result = [r1, r2, r3, i1, i2] + items = [i1 , i2] + requests = [r1, r2, r3] + result = list(requests) + result.extend(items) resp = Response(fr1.url, request=Request(fr1.url, meta={b'frontier_request': fr1})) crawler = FakeCrawler() fs = FronteraScheduler(crawler, manager=FakeFrontierManager) - fs.open(Spider) - out = list(fs.process_spider_output(resp, result, Spider)) - assert len(out) == len(result) - out_request = out[:no_requests] - assert set(r.url for r in out_request) == set(r.url for r in result[:no_requests]) - out_items = out[no_requests:] - assert sorted(out_items, key=lambda i: sorted(i['item'])) == \ - sorted([i1, i2], key=lambda i: sorted(i['item'])) + spider = Spider(name="testing") + fs.open(spider) + out_items = list(fs.process_spider_output(resp, result, spider)) + assert len(out_items) == len(items) + assert set([r.url for r in fs.frontier.manager.links]) == set([r.url for r in requests]) + assert isinstance(fs.frontier.manager.responses[0], FResponse) assert fs.frontier.manager.responses[0].url == resp.url assert set([request.url for request in fs.frontier.manager.links]) == set([r1.url, r2.url, r3.url]) From 32069febcccafedb85607d1597301415e5610d75 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 30 May 2018 17:08:21 +0200 Subject: [PATCH 179/273] tests fixed --- frontera/core/manager.py | 64 ++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/frontera/core/manager.py b/frontera/core/manager.py index 389f69986..c005d56ff 100644 --- a/frontera/core/manager.py +++ 
b/frontera/core/manager.py @@ -80,9 +80,10 @@ def __init__(self, backend, middlewares=None, canonicalsolver=None, db_worker=Fa # Load canonical solver self._logger_components.debug("Loading canonical url solver '%s'", canonicalsolver) - self._canonicalsolver = self._load_object(canonicalsolver) - assert isinstance(self.canonicalsolver, CanonicalSolver), \ - "canonical solver '%s' must subclass CanonicalSolver" % self.canonicalsolver.__class__.__name__ + if canonicalsolver: + self._canonicalsolver = self._load_object(canonicalsolver) + assert isinstance(self.canonicalsolver, CanonicalSolver), \ + "canonical solver '%s' must subclass CanonicalSolver" % self.canonicalsolver.__class__.__name__ BackendMixin.__init__(self, backend, db_worker, strategy_worker) @property @@ -613,8 +614,8 @@ class WorkerFrontierManager(BaseContext, StrategyComponentsPipelineMixin): The :class:`WorkerFrontierManager ` class role is to instantiate the core components and is used mainly by workers. """ - def __init__(self, settings, request_model, response_model, backend, strategy_class, strategy_args, - max_next_requests, scoring_stream, middlewares=None, canonicalsolver=None, db_worker=False, + def __init__(self, settings, request_model, response_model, backend, max_next_requests, strategy_class=None, + strategy_args=None, scoring_stream=None, middlewares=None, canonicalsolver=None, db_worker=False, strategy_worker=False): """ :param object/string request_model: The :class:`Request ` object to be \ @@ -648,17 +649,18 @@ def __init__(self, settings, request_model, response_model, backend, strategy_cl BaseContext.__init__(self, request_model, response_model, settings=settings) self._max_next_requests = max_next_requests - - StrategyComponentsPipelineMixin.__init__(self, backend, strategy_class, strategy_args, scoring_stream, - middlewares=middlewares, canonicalsolver=canonicalsolver, - db_worker=db_worker,strategy_worker=strategy_worker) - - # Init frontier components pipeline - # Some code relies on the order, modify carefully - self._components_pipeline = [ - ('Middleware', self.middlewares, True), - ('CanonicalSolver', self.canonicalsolver, False), - ] + if strategy_worker: + StrategyComponentsPipelineMixin.__init__(self, backend, strategy_class, strategy_args, scoring_stream, + middlewares=middlewares, canonicalsolver=canonicalsolver, + db_worker=db_worker,strategy_worker=strategy_worker) + # Init frontier components pipeline + # Some code relies on the order, modify carefully + self._components_pipeline = [ + ('Middleware', self.middlewares, True), + ('CanonicalSolver', self.canonicalsolver, False), + ] + if db_worker: + ComponentsPipelineMixin.__init__(self, backend, db_worker=db_worker,strategy_worker=strategy_worker) # Log frontier manager start self._logger.info('Frontier Manager Started!') @@ -667,18 +669,24 @@ def __init__(self, settings, request_model, response_model, backend, strategy_cl @classmethod def from_settings(cls, settings=None, db_worker=False, strategy_worker=False, scoring_stream=None): manager_settings = Settings.object_from(settings) - return WorkerFrontierManager(request_model=manager_settings.REQUEST_MODEL, - response_model=manager_settings.RESPONSE_MODEL, - backend=manager_settings.BACKEND, - strategy_class=manager_settings.STRATEGY, - strategy_args=manager_settings.STRATEGY_ARGS, - middlewares=manager_settings.MIDDLEWARES, - max_next_requests=manager_settings.MAX_NEXT_REQUESTS, - settings=manager_settings, - canonicalsolver=manager_settings.CANONICAL_SOLVER, - db_worker=db_worker, - 
strategy_worker=strategy_worker, - scoring_stream=scoring_stream) + kwargs = { + 'request_model': manager_settings.REQUEST_MODEL, + 'response_model': manager_settings.RESPONSE_MODEL, + 'backend' : manager_settings.BACKEND, + 'max_next_requests': manager_settings.MAX_NEXT_REQUESTS, + 'settings': manager_settings, + 'db_worker': db_worker, + 'strategy_worker': strategy_worker + } + if strategy_worker: + kwargs.update({ + 'strategy_class': manager_settings.STRATEGY, + 'strategy_args': manager_settings.STRATEGY_ARGS, + 'middlewares': manager_settings.MIDDLEWARES, + 'canonicalsolver': manager_settings.CANONICAL_SOLVER, + 'scoring_stream': scoring_stream + }) + return WorkerFrontierManager(**kwargs) def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): """ From 214d34558ac46f1b438638398e908c8936286a2b Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 30 May 2018 19:24:59 +0200 Subject: [PATCH 180/273] backend tests fixes --- .../contrib/backends/sqlalchemy/revisiting.py | 4 + frontera/utils/graphs/data.py | 6 +- frontera/utils/tester.py | 21 +- tests/backends.py | 444 ++++++++++-------- .../sqlalchemy/test_backend_sqlalchemy.py | 10 +- 5 files changed, 281 insertions(+), 204 deletions(-) diff --git a/frontera/contrib/backends/sqlalchemy/revisiting.py b/frontera/contrib/backends/sqlalchemy/revisiting.py index b2b574715..f079756d5 100644 --- a/frontera/contrib/backends/sqlalchemy/revisiting.py +++ b/frontera/contrib/backends/sqlalchemy/revisiting.py @@ -104,6 +104,10 @@ def count(self): class Backend(SQLAlchemyBackend): + """ + DEPRECATED, and will be removed in the next versions. Revisiting is meant to be implemented as part of + crawling strategy. + """ def _create_queue(self, settings): self.interval = settings.get("SQLALCHEMYBACKEND_REVISIT_INTERVAL") diff --git a/frontera/utils/graphs/data.py b/frontera/utils/graphs/data.py index f925cdf99..3822b597d 100644 --- a/frontera/utils/graphs/data.py +++ b/frontera/utils/graphs/data.py @@ -65,17 +65,17 @@ def __len__(self): SITE_A = CrawlSiteData( name='A', description='', - pages=create_test_site('A', 4, 2)) + pages=create_test_site('http://aaa.com/', 4, 2)) SITE_B = CrawlSiteData( name='B', description='', - pages=create_test_site('B', 4, 2)) + pages=create_test_site('http://bbb.com/', 4, 2)) SITE_C = CrawlSiteData( name='C', description='', - pages=create_test_site('C', 5, 2, self_link=True)) + pages=create_test_site('http://ccc.com/', 5, 2, self_link=True)) #----------------------------------------------------- diff --git a/frontera/utils/tester.py b/frontera/utils/tester.py index 29956406f..b8f979ced 100644 --- a/frontera/utils/tester.py +++ b/frontera/utils/tester.py @@ -1,9 +1,11 @@ -from __future__ import absolute_import +from __future__ import absolute_import, print_function from collections import OrderedDict, deque from six.moves.urllib.parse import urlparse import six from six.moves import range +from io import BytesIO +from os import linesep class FrontierTester(object): @@ -31,15 +33,24 @@ def run(self, add_all_pages=False): self.frontier.stop() def _add_seeds(self): - self.frontier.add_seeds([self._make_request(seed.url) for seed in self.graph_manager.seeds]) + stream = BytesIO() + for seed in self.graph_manager.seeds: + stream.write(seed.url.encode('utf8')) + stream.write(linesep.encode('utf8')) + stream.seek(0) + self.frontier.add_seeds(stream) def _add_all(self): + stream = BytesIO() for page in self.graph_manager.pages: - if page.is_seed: - 
self.frontier.add_seeds([self._make_request(page.url)]) + stream.write(page.url.encode('utf8')) if not page.has_errors: for link in page.links: - self.frontier.add_seeds([self._make_request(link.url)]) + stream.write(link.url.encode('utf8')) + stream.write(linesep.encode('utf8')) + stream.seek(0) + + self.frontier.add_seeds(stream) def _make_request(self, url): r = self.frontier.request_model(url=url, diff --git a/tests/backends.py b/tests/backends.py index ace58ce4e..41180778c 100644 --- a/tests/backends.py +++ b/tests/backends.py @@ -1,13 +1,71 @@ from __future__ import absolute_import import pytest +from frontera.core.components import States from frontera.core.manager import LocalFrontierManager -from frontera.worker.strategies.bfs import CrawlingStrategy +from frontera.worker.strategies import BaseCrawlingStrategy from frontera import Settings, FrontierTester from frontera.utils import graphs from frontera.utils.tester import BaseDownloaderSimulator +class BasicCrawlingStrategy(BaseCrawlingStrategy): + def read_seeds(self, stream): + for url in stream: + url = url.strip() + r = self._create_request(url) + self.schedule(r, 1.0) + + def _create_request(self, url): + return self.create_request(url=url, + headers={ + b'X-Important-Header': b'Frontera' + }, + method=b'POST', + cookies={b'currency': b'USD'}, + meta={b'this_param': b'should be passed over'}) + + def filter_extracted_links(self, request, links): + return links + + def links_extracted(self, request, links): + for link in links: + if link.meta[b'state'] == States.NOT_CRAWLED: + self.schedule(self._create_request(link.url)) + link.meta[b'state'] = States.QUEUED + + def page_crawled(self, response): + response.meta[b'state'] = States.CRAWLED + + def page_error(self, request, error): + request.meta[b'state'] = States.ERROR + + +class DFSCrawlingStrategy(BasicCrawlingStrategy): + def read_seeds(self, stream): + for url in stream: + url = url.strip() + r = self._create_request(url) + r.meta[b'depth'] = 0 + self.schedule(r, self._get_score(r.meta[b'depth'])) + + def links_extracted(self, request, links): + for link in links: + if link.meta[b'state'] == States.NOT_CRAWLED: + r = self._create_request(link.url) + r.meta[b'depth'] = request.meta[b'depth'] + 1 + self.schedule(r, self._get_score(r.meta[b'depth'])) + link.meta[b'state'] = States.QUEUED + + def _get_score(self, depth): + return 1.0 / (depth + 1.0) + + +class BFSCrawlingStrategy(DFSCrawlingStrategy): + def _get_score(self, depth): + return float(depth) / 10.0 + + class BackendTest(object): """ A simple pytest base class with helper methods for @@ -48,7 +106,7 @@ def get_settings(self): """ return Settings(attributes={ 'BACKEND': self.backend_class, - 'STRATEGY': 'frontera.worker.strategies.bfs.CrawlingStrategy' + 'STRATEGY': 'tests.backends.BasicCrawlingStrategy' }) @@ -118,7 +176,7 @@ def assert_sequence(self, site_list, expected_sequence, max_next_requests): # Get sequence sequence = self.get_url_sequence(site_list, max_next_requests) - #print [str(n) for n in sequence] + #print ([str(n) for n in sequence]) # Assert sequence equals expected assert len(sequence) == len(expected_sequence) @@ -126,31 +184,30 @@ def assert_sequence(self, site_list, expected_sequence, max_next_requests): class FIFOBackendTest(BackendSequenceTest): - EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', 'A121', 'A122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222' + 'http://aaa.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', + 
'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222' ], "SEQUENCE_02_A": [ - 'A1', 'B1', - 'A11', 'A12', 'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', 'B111', 'B112', 'B121', 'B122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', - 'B1111', 'B1112', 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' - ], + 'http://aaa.com/1', 'http://bbb.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', 'http://bbb.com/11', 'http://bbb.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', 'http://bbb.com/111', 'http://bbb.com/112', 'http://bbb.com/121', 'http://bbb.com/122', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222', 'http://bbb.com/1111', 'http://bbb.com/1112', 'http://bbb.com/1121', 'http://bbb.com/1122', 'http://bbb.com/1211', 'http://bbb.com/1212', 'http://bbb.com/1221', 'http://bbb.com/1222' + ] + , "SEQUENCE_03_A": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', 'C1212', 'C1221', 'C1222', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', - 'C12111', 'C12112', 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + 'http://ccc.com/1', + 'http://ccc.com/11', 'http://ccc.com/12', + 'http://ccc.com/111', 'http://ccc.com/112', 'http://ccc.com/121', 'http://ccc.com/122', + 'http://ccc.com/1111', 'http://ccc.com/1112', 'http://ccc.com/1121', 'http://ccc.com/1122', 'http://ccc.com/1211', 'http://ccc.com/1212', 'http://ccc.com/1221', 'http://ccc.com/1222', + 'http://ccc.com/11111', 'http://ccc.com/11112', 'http://ccc.com/11121', 'http://ccc.com/11122', 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/11221', 'http://ccc.com/11222', 'http://ccc.com/12111', 'http://ccc.com/12112', 'http://ccc.com/12121', 'http://ccc.com/12122', 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/12221', 'http://ccc.com/12222' ], } + @pytest.mark.parametrize( ('site_list', 'max_next_requests', 'expected_sequence'), [ @@ -185,97 +242,97 @@ class LIFOBackendTest(BackendSequenceTest): EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A12', - 'A122', 'A1222', 'A1221', - 'A121', 'A1212', 'A1211', - 'A11', - 'A112', 'A1122', 'A1121', - 'A111', 'A1112', 'A1111' + 'http://aaa.com/1', + 'http://aaa.com/12', + 'http://aaa.com/122', 'http://aaa.com/1222', 'http://aaa.com/1221', + 'http://aaa.com/121', 'http://aaa.com/1212', 'http://aaa.com/1211', + 'http://aaa.com/11', + 'http://aaa.com/112', 'http://aaa.com/1122', 'http://aaa.com/1121', + 'http://aaa.com/111', 'http://aaa.com/1112', 'http://aaa.com/1111' ], "SEQUENCE_01_B": [ - 'A1', - 'A12', 'A11', - 'A112', 'A111', - 'A1112', 'A1111', 'A1122', 'A1121', - 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221'], + 'http://aaa.com/1', + 'http://aaa.com/12', 'http://aaa.com/11', + 'http://aaa.com/112', 'http://aaa.com/111', + 'http://aaa.com/1112', 'http://aaa.com/1111', 'http://aaa.com/1122', 'http://aaa.com/1121', + 'http://aaa.com/122', 'http://aaa.com/121', + 'http://aaa.com/1212', 'http://aaa.com/1211', 'http://aaa.com/1222', 'http://aaa.com/1221'], "SEQUENCE_01_C": [ - 'A1', - 'A12', 'A11', - 'A112', 'A111', 'A122', 'A121', - 
'A1212', 'A1211', 'A1222', 'A1221', 'A1112', 'A1111', 'A1122', 'A1121' + 'http://aaa.com/1', + 'http://aaa.com/12', 'http://aaa.com/11', + 'http://aaa.com/112', 'http://aaa.com/111', 'http://aaa.com/122', 'http://aaa.com/121', + 'http://aaa.com/1212', 'http://aaa.com/1211', 'http://aaa.com/1222', 'http://aaa.com/1221', 'http://aaa.com/1112', 'http://aaa.com/1111', 'http://aaa.com/1122', 'http://aaa.com/1121' ], "SEQUENCE_02_A": [ - 'B1', - 'B12', 'B122', 'B1222', 'B1221', 'B121', 'B1212', 'B1211', - 'B11', 'B112', 'B1122', 'B1121', 'B111', 'B1112', 'B1111', - 'A1', - 'A12', 'A122', 'A1222', 'A1221', 'A121', 'A1212', 'A1211', - 'A11', 'A112', 'A1122', 'A1121', 'A111', 'A1112', 'A1111' + 'http://bbb.com/1', + 'http://bbb.com/12', 'http://bbb.com/122', 'http://bbb.com/1222', 'http://bbb.com/1221', 'http://bbb.com/121', 'http://bbb.com/1212', 'http://bbb.com/1211', + 'http://bbb.com/11', 'http://bbb.com/112', 'http://bbb.com/1122', 'http://bbb.com/1121', 'http://bbb.com/111', 'http://bbb.com/1112', 'http://bbb.com/1111', + 'http://aaa.com/1', + 'http://aaa.com/12', 'http://aaa.com/122', 'http://aaa.com/1222', 'http://aaa.com/1221', 'http://aaa.com/121', 'http://aaa.com/1212', 'http://aaa.com/1211', + 'http://aaa.com/11', 'http://aaa.com/112', 'http://aaa.com/1122', 'http://aaa.com/1121', 'http://aaa.com/111', 'http://aaa.com/1112', 'http://aaa.com/1111' ], "SEQUENCE_02_B": [ - 'B1', 'A1', - 'A12', 'A11', - 'A112', 'A111', - 'A1112', 'A1111', 'A1122', 'A1121', - 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221', - 'B12', 'B11', - 'B112', 'B111', - 'B1112', 'B1111', 'B1122', 'B1121', - 'B122', 'B121', - 'B1212', 'B1211', 'B1222', 'B1221' + 'http://bbb.com/1', 'http://aaa.com/1', + 'http://aaa.com/12', 'http://aaa.com/11', + 'http://aaa.com/112', 'http://aaa.com/111', + 'http://aaa.com/1112', 'http://aaa.com/1111', 'http://aaa.com/1122', 'http://aaa.com/1121', + 'http://aaa.com/122', 'http://aaa.com/121', + 'http://aaa.com/1212', 'http://aaa.com/1211', 'http://aaa.com/1222', 'http://aaa.com/1221', + 'http://bbb.com/12', 'http://bbb.com/11', + 'http://bbb.com/112', 'http://bbb.com/111', + 'http://bbb.com/1112', 'http://bbb.com/1111', 'http://bbb.com/1122', 'http://bbb.com/1121', + 'http://bbb.com/122', 'http://bbb.com/121', + 'http://bbb.com/1212', 'http://bbb.com/1211', 'http://bbb.com/1222', 'http://bbb.com/1221' ], "SEQUENCE_02_C": [ - 'B1', 'A1', - 'A12', 'A11', 'B12', 'B11', 'B112', 'B111', 'B122', 'B121', 'A112', - 'A1122', 'A1121', 'B1212', 'B1211', 'B1222', 'B1221', 'B1112', 'B1111', 'B1122', 'B1121', - 'A111', 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221', 'A1112', 'A1111' + 'http://bbb.com/1', 'http://aaa.com/1', + 'http://aaa.com/12', 'http://aaa.com/11', 'http://bbb.com/12', 'http://bbb.com/11', 'http://bbb.com/112', 'http://bbb.com/111', 'http://bbb.com/122', 'http://bbb.com/121', 'http://aaa.com/112', + 'http://aaa.com/1122', 'http://aaa.com/1121', 'http://bbb.com/1212', 'http://bbb.com/1211', 'http://bbb.com/1222', 'http://bbb.com/1221', 'http://bbb.com/1112', 'http://bbb.com/1111', 'http://bbb.com/1122', 'http://bbb.com/1121', + 'http://aaa.com/111', 'http://aaa.com/122', 'http://aaa.com/121', + 'http://aaa.com/1212', 'http://aaa.com/1211', 'http://aaa.com/1222', 'http://aaa.com/1221', 'http://aaa.com/1112', 'http://aaa.com/1111' ], "SEQUENCE_02_D": [ - 'B1', 'A1', - 'A12', 'A11', 'B12', 'B11', 'B112', 'B111', 'B122', 'B121', 'A112', 'A111', 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221', 'A1112', 'A1111', 'A1122', 'A1121', - 'B1212', 'B1211', 'B1222', 'B1221', 
'B1112', 'B1111', 'B1122', 'B1121' + 'http://bbb.com/1', 'http://aaa.com/1', + 'http://aaa.com/12', 'http://aaa.com/11', 'http://bbb.com/12', 'http://bbb.com/11', 'http://bbb.com/112', 'http://bbb.com/111', 'http://bbb.com/122', 'http://bbb.com/121', 'http://aaa.com/112', 'http://aaa.com/111', 'http://aaa.com/122', 'http://aaa.com/121', + 'http://aaa.com/1212', 'http://aaa.com/1211', 'http://aaa.com/1222', 'http://aaa.com/1221', 'http://aaa.com/1112', 'http://aaa.com/1111', 'http://aaa.com/1122', 'http://aaa.com/1121', + 'http://bbb.com/1212', 'http://bbb.com/1211', 'http://bbb.com/1222', 'http://bbb.com/1221', 'http://bbb.com/1112', 'http://bbb.com/1111', 'http://bbb.com/1122', 'http://bbb.com/1121' ], "SEQUENCE_03_A": [ - 'C1', 'C12', 'C122', 'C1222', 'C12222', 'C12221', 'C1221', 'C12212', 'C12211', - 'C121', 'C1212', 'C12122', 'C12121', 'C1211', 'C12112', 'C12111', - 'C11', 'C112', 'C1122', 'C11222', 'C11221', 'C1121', 'C11212', 'C11211', - 'C111', 'C1112', 'C11122', 'C11121', 'C1111', 'C11112', 'C11111' + 'http://ccc.com/1', 'http://ccc.com/12', 'http://ccc.com/122', 'http://ccc.com/1222', 'http://ccc.com/12222', 'http://ccc.com/12221', 'http://ccc.com/1221', 'http://ccc.com/12212', 'http://ccc.com/12211', + 'http://ccc.com/121', 'http://ccc.com/1212', 'http://ccc.com/12122', 'http://ccc.com/12121', 'http://ccc.com/1211', 'http://ccc.com/12112', 'http://ccc.com/12111', + 'http://ccc.com/11', 'http://ccc.com/112', 'http://ccc.com/1122', 'http://ccc.com/11222', 'http://ccc.com/11221', 'http://ccc.com/1121', 'http://ccc.com/11212', 'http://ccc.com/11211', + 'http://ccc.com/111', 'http://ccc.com/1112', 'http://ccc.com/11122', 'http://ccc.com/11121', 'http://ccc.com/1111', 'http://ccc.com/11112', 'http://ccc.com/11111' ], "SEQUENCE_03_B": [ - 'C1', - 'C12', 'C11', - 'C112', 'C111', - 'C1112', 'C1111', 'C11112', 'C11111', 'C11122', 'C11121', - 'C1122', 'C1121', 'C11212', 'C11211', 'C11222', 'C11221', - 'C122', 'C121', - 'C1212', 'C1211', 'C12112', 'C12111', 'C12122', 'C12121', - 'C1222', 'C1221', 'C12212', 'C12211', 'C12222', 'C12221' + 'http://ccc.com/1', + 'http://ccc.com/12', 'http://ccc.com/11', + 'http://ccc.com/112', 'http://ccc.com/111', + 'http://ccc.com/1112', 'http://ccc.com/1111', 'http://ccc.com/11112', 'http://ccc.com/11111', 'http://ccc.com/11122', 'http://ccc.com/11121', + 'http://ccc.com/1122', 'http://ccc.com/1121', 'http://ccc.com/11212', 'http://ccc.com/11211', 'http://ccc.com/11222', 'http://ccc.com/11221', + 'http://ccc.com/122', 'http://ccc.com/121', + 'http://ccc.com/1212', 'http://ccc.com/1211', 'http://ccc.com/12112', 'http://ccc.com/12111', 'http://ccc.com/12122', 'http://ccc.com/12121', + 'http://ccc.com/1222', 'http://ccc.com/1221', 'http://ccc.com/12212', 'http://ccc.com/12211', 'http://ccc.com/12222', 'http://ccc.com/12221' ], "SEQUENCE_03_C": [ - 'C1', - 'C12', 'C11', - 'C112', 'C111', 'C122', 'C121', - 'C1212', 'C1211', 'C1222', 'C1221', 'C1112', - 'C11122', 'C11121', 'C12212', 'C12211', - 'C12222', 'C12221', 'C12112', 'C12111', - 'C12122', 'C12121', - 'C1111', 'C1122', 'C1121', 'C11212', - 'C11211', 'C11222', 'C11221', 'C11112', 'C11111' + 'http://ccc.com/1', + 'http://ccc.com/12', 'http://ccc.com/11', + 'http://ccc.com/112', 'http://ccc.com/111', 'http://ccc.com/122', 'http://ccc.com/121', + 'http://ccc.com/1212', 'http://ccc.com/1211', 'http://ccc.com/1222', 'http://ccc.com/1221', 'http://ccc.com/1112', + 'http://ccc.com/11122', 'http://ccc.com/11121', 'http://ccc.com/12212', 'http://ccc.com/12211', + 'http://ccc.com/12222', 'http://ccc.com/12221', 
'http://ccc.com/12112', 'http://ccc.com/12111', + 'http://ccc.com/12122', 'http://ccc.com/12121', + 'http://ccc.com/1111', 'http://ccc.com/1122', 'http://ccc.com/1121', 'http://ccc.com/11212', + 'http://ccc.com/11211', 'http://ccc.com/11222', 'http://ccc.com/11221', 'http://ccc.com/11112', 'http://ccc.com/11111' ], "SEQUENCE_03_D": [ - 'C1', - 'C12', 'C11', - 'C112', 'C111', 'C122', 'C121', - 'C1212', 'C1211', 'C1222', 'C1221', - 'C1112', 'C1111', 'C1122', 'C1121', - 'C11212', 'C11211', 'C11222', 'C11221', 'C11112', 'C11111', 'C11122', 'C11121', - 'C12212', 'C12211', 'C12222', 'C12221', 'C12112', 'C12111', 'C12122', 'C12121' + 'http://ccc.com/1', + 'http://ccc.com/12', 'http://ccc.com/11', + 'http://ccc.com/112', 'http://ccc.com/111', 'http://ccc.com/122', 'http://ccc.com/121', + 'http://ccc.com/1212', 'http://ccc.com/1211', 'http://ccc.com/1222', 'http://ccc.com/1221', + 'http://ccc.com/1112', 'http://ccc.com/1111', 'http://ccc.com/1122', 'http://ccc.com/1121', + 'http://ccc.com/11212', 'http://ccc.com/11211', 'http://ccc.com/11222', 'http://ccc.com/11221', 'http://ccc.com/11112', 'http://ccc.com/11111', 'http://ccc.com/11122', 'http://ccc.com/11121', + 'http://ccc.com/12212', 'http://ccc.com/12211', 'http://ccc.com/12222', 'http://ccc.com/12221', 'http://ccc.com/12112', 'http://ccc.com/12111', 'http://ccc.com/12122', 'http://ccc.com/12121' ], } @@ -313,108 +370,108 @@ class DFSBackendTest(BackendSequenceTest): EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A11', 'A111', 'A1111', 'A1112', 'A112', 'A1121', 'A1122', - 'A12', 'A121', 'A1211', 'A1212', 'A122', 'A1221', 'A1222' + 'http://aaa.com/1', + 'http://aaa.com/11', 'http://aaa.com/111', 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/112', 'http://aaa.com/1121', 'http://aaa.com/1122', + 'http://aaa.com/12', 'http://aaa.com/121', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/122', 'http://aaa.com/1221', 'http://aaa.com/1222' ], "SEQUENCE_01_B": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', - 'A1111', 'A1112', 'A1121', 'A1122', - 'A121', 'A122', - 'A1211', 'A1212', 'A1221', 'A1222' + 'http://aaa.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', + 'http://aaa.com/121', 'http://aaa.com/122', + 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222' ], "SEQUENCE_01_C": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', 'A121', 'A122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222' + 'http://aaa.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222' ], "SEQUENCE_02_A": [ - 'A1', - 'A11', - 'A111', 'A1111', 'A1112', - 'A112', 'A1121', 'A1122', - 'A12', - 'A121', 'A1211', 'A1212', - 'A122', 'A1221', 'A1222', - 'B1', - 'B11', - 'B111', 'B1111', 'B1112', - 'B112', 'B1121', 'B1122', - 'B12', - 'B121', 'B1211', 'B1212', - 'B122', 'B1221', 'B1222' + 'http://aaa.com/1', + 'http://aaa.com/11', + 'http://aaa.com/111', 'http://aaa.com/1111', 'http://aaa.com/1112', + 'http://aaa.com/112', 'http://aaa.com/1121', 'http://aaa.com/1122', + 'http://aaa.com/12', + 'http://aaa.com/121', 'http://aaa.com/1211', 'http://aaa.com/1212', + 'http://aaa.com/122', 
'http://aaa.com/1221', 'http://aaa.com/1222', + 'http://bbb.com/1', + 'http://bbb.com/11', + 'http://bbb.com/111', 'http://bbb.com/1111', 'http://bbb.com/1112', + 'http://bbb.com/112', 'http://bbb.com/1121', 'http://bbb.com/1122', + 'http://bbb.com/12', + 'http://bbb.com/121', 'http://bbb.com/1211', 'http://bbb.com/1212', + 'http://bbb.com/122', 'http://bbb.com/1221', 'http://bbb.com/1222' ], "SEQUENCE_02_B": [ - 'A1', 'B1', - 'A11', 'A12', - 'A111', 'A112', - 'A1111', 'A1112', 'A1121', 'A1122', - 'A121', 'A122', - 'A1211', 'A1212', 'A1221', 'A1222', - 'B11', 'B12', - 'B111', 'B112', - 'B1111', 'B1112', 'B1121', 'B1122', - 'B121', 'B122', - 'B1211', 'B1212', 'B1221', 'B1222' + 'http://aaa.com/1', 'http://bbb.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', + 'http://aaa.com/121', 'http://aaa.com/122', + 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222', + 'http://bbb.com/11', 'http://bbb.com/12', + 'http://bbb.com/111', 'http://bbb.com/112', + 'http://bbb.com/1111', 'http://bbb.com/1112', 'http://bbb.com/1121', 'http://bbb.com/1122', + 'http://bbb.com/121', 'http://bbb.com/122', + 'http://bbb.com/1211', 'http://bbb.com/1212', 'http://bbb.com/1221', 'http://bbb.com/1222' ], "SEQUENCE_02_C": [ - 'A1', 'B1', - 'A11', 'A12', 'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', 'B111', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', 'B1111', 'B1112', - 'B112', 'B121', 'B122', - 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' + 'http://aaa.com/1', 'http://bbb.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', 'http://bbb.com/11', 'http://bbb.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', 'http://bbb.com/111', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222', 'http://bbb.com/1111', 'http://bbb.com/1112', + 'http://bbb.com/112', 'http://bbb.com/121', 'http://bbb.com/122', + 'http://bbb.com/1121', 'http://bbb.com/1122', 'http://bbb.com/1211', 'http://bbb.com/1212', 'http://bbb.com/1221', 'http://bbb.com/1222' ], "SEQUENCE_02_D": [ - 'A1', 'B1', - 'A11', 'A12', 'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', - 'B111', 'B112', 'B121', 'B122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', - 'B1111', 'B1112', 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' + 'http://aaa.com/1', 'http://bbb.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', 'http://bbb.com/11', 'http://bbb.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', + 'http://bbb.com/111', 'http://bbb.com/112', 'http://bbb.com/121', 'http://bbb.com/122', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222', + 'http://bbb.com/1111', 'http://bbb.com/1112', 'http://bbb.com/1121', 'http://bbb.com/1122', 'http://bbb.com/1211', 'http://bbb.com/1212', 'http://bbb.com/1221', 'http://bbb.com/1222' ], "SEQUENCE_03_A": [ - 'C1', - 'C11', - 'C111', 'C1111', 'C11111', 'C11112', 'C1112', 'C11121', 'C11122', - 'C112', 'C1121', 'C11211', 'C11212', 'C1122', 'C11221', 'C11222', - 'C12', - 'C121', 'C1211', 'C12111', 'C12112', 'C1212', 'C12121', 'C12122', - 'C122', 
'C1221', 'C12211', 'C12212', 'C1222', 'C12221', 'C12222' + 'http://ccc.com/1', + 'http://ccc.com/11', + 'http://ccc.com/111', 'http://ccc.com/1111', 'http://ccc.com/11111', 'http://ccc.com/11112', 'http://ccc.com/1112', 'http://ccc.com/11121', 'http://ccc.com/11122', + 'http://ccc.com/112', 'http://ccc.com/1121', 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/1122', 'http://ccc.com/11221', 'http://ccc.com/11222', + 'http://ccc.com/12', + 'http://ccc.com/121', 'http://ccc.com/1211', 'http://ccc.com/12111', 'http://ccc.com/12112', 'http://ccc.com/1212', 'http://ccc.com/12121', 'http://ccc.com/12122', + 'http://ccc.com/122', 'http://ccc.com/1221', 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/1222', 'http://ccc.com/12221', 'http://ccc.com/12222' ], "SEQUENCE_03_B": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', - 'C1111', 'C1112', - 'C11111', 'C11112', 'C11121', 'C11122', - 'C1121', 'C1122', - 'C11211', 'C11212', 'C11221', 'C11222', - 'C121', 'C122', - 'C1211', 'C1212', - 'C12111', 'C12112', 'C12121', 'C12122', - 'C1221', 'C1222', - 'C12211', 'C12212', 'C12221', 'C12222' + 'http://ccc.com/1', + 'http://ccc.com/11', 'http://ccc.com/12', + 'http://ccc.com/111', 'http://ccc.com/112', + 'http://ccc.com/1111', 'http://ccc.com/1112', + 'http://ccc.com/11111', 'http://ccc.com/11112', 'http://ccc.com/11121', 'http://ccc.com/11122', + 'http://ccc.com/1121', 'http://ccc.com/1122', + 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/11221', 'http://ccc.com/11222', + 'http://ccc.com/121', 'http://ccc.com/122', + 'http://ccc.com/1211', 'http://ccc.com/1212', + 'http://ccc.com/12111', 'http://ccc.com/12112', 'http://ccc.com/12121', 'http://ccc.com/12122', + 'http://ccc.com/1221', 'http://ccc.com/1222', + 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/12221', 'http://ccc.com/12222' ], "SEQUENCE_03_C": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', 'C12111', 'C12112', - 'C1212', 'C1221', 'C1222', - 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + 'http://ccc.com/1', + 'http://ccc.com/11', 'http://ccc.com/12', + 'http://ccc.com/111', 'http://ccc.com/112', 'http://ccc.com/121', 'http://ccc.com/122', + 'http://ccc.com/1111', 'http://ccc.com/1112', 'http://ccc.com/1121', 'http://ccc.com/1122', 'http://ccc.com/1211', + 'http://ccc.com/11111', 'http://ccc.com/11112', 'http://ccc.com/11121', 'http://ccc.com/11122', 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/11221', 'http://ccc.com/11222', 'http://ccc.com/12111', 'http://ccc.com/12112', + 'http://ccc.com/1212', 'http://ccc.com/1221', 'http://ccc.com/1222', + 'http://ccc.com/12121', 'http://ccc.com/12122', 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/12221', 'http://ccc.com/12222' ], "SEQUENCE_03_D": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', 'C1212', 'C1221', 'C1222', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', - 'C12111', 'C12112', 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + 'http://ccc.com/1', + 'http://ccc.com/11', 'http://ccc.com/12', + 'http://ccc.com/111', 'http://ccc.com/112', 'http://ccc.com/121', 'http://ccc.com/122', + 'http://ccc.com/1111', 'http://ccc.com/1112', 'http://ccc.com/1121', 'http://ccc.com/1122', 'http://ccc.com/1211', 'http://ccc.com/1212', 'http://ccc.com/1221', 
'http://ccc.com/1222', + 'http://ccc.com/11111', 'http://ccc.com/11112', 'http://ccc.com/11121', 'http://ccc.com/11122', 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/11221', 'http://ccc.com/11222', + 'http://ccc.com/12111', 'http://ccc.com/12112', 'http://ccc.com/12121', 'http://ccc.com/12122', 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/12221', 'http://ccc.com/12222' ], } @@ -447,31 +504,40 @@ def test_sequence(self, site_list, max_next_requests, expected_sequence): max_next_requests=max_next_requests, ) + def get_settings(self): + settings = super(BackendSequenceTest, self).get_settings() + settings.TEST_MODE = True + settings.LOGGING_MANAGER_ENABLED = False + settings.LOGGING_BACKEND_ENABLED = False + settings.LOGGING_DEBUGGING_ENABLED = False + settings.STRATEGY = 'tests.backends.DFSCrawlingStrategy' + return settings + class BFSBackendTest(BackendSequenceTest): EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', 'A121', 'A122', - 'A1111', 'A1112', 'A1121', 'A1122', - 'A1211', 'A1212', 'A1221', 'A1222' + 'http://aaa.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', + 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222' ], "SEQUENCE_02_A": [ - 'A1', 'B1', - 'A11', 'A12', 'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', 'B111', 'B112', 'B121', 'B122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', - 'B1111', 'B1112', 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' + 'http://aaa.com/1', 'http://bbb.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', 'http://bbb.com/11', 'http://bbb.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', 'http://bbb.com/111', 'http://bbb.com/112', 'http://bbb.com/121', 'http://bbb.com/122', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222', + 'http://bbb.com/1111', 'http://bbb.com/1112', 'http://bbb.com/1121', 'http://bbb.com/1122', 'http://bbb.com/1211', 'http://bbb.com/1212', 'http://bbb.com/1221', 'http://bbb.com/1222' ], "SEQUENCE_03_A": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', 'C1212', 'C1221', 'C1222', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', - 'C12111', 'C12112', 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + 'http://ccc.com/1', + 'http://ccc.com/11', 'http://ccc.com/12', + 'http://ccc.com/111', 'http://ccc.com/112', 'http://ccc.com/121', 'http://ccc.com/122', + 'http://ccc.com/1111', 'http://ccc.com/1112', 'http://ccc.com/1121', 'http://ccc.com/1122', 'http://ccc.com/1211', 'http://ccc.com/1212', 'http://ccc.com/1221', 'http://ccc.com/1222', + 'http://ccc.com/11111', 'http://ccc.com/11112', 'http://ccc.com/11121', 'http://ccc.com/11122', 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/11221', 'http://ccc.com/11222', + 'http://ccc.com/12111', 'http://ccc.com/12112', 'http://ccc.com/12121', 'http://ccc.com/12122', 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/12221', 'http://ccc.com/12222' ], } diff --git a/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py 
b/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py index 0dceaaa7d..98ebdc400 100644 --- a/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py +++ b/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py @@ -28,9 +28,6 @@ class SQLAlchemyBFS(backends.BFSBackendTest): backend_class = 'frontera.contrib.backends.sqlalchemy.BFS' -class SQLAlchemyRevisiting(RevisitingBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.revisiting.Backend' - #---------------------------------------------------- # SQLite Memory @@ -59,10 +56,6 @@ class TestSQLiteMemoryBFS(SQLAlchemyBFS, SQLiteMemory): pass -class TestSQLiteMemoryRevisiting(SQLAlchemyRevisiting): - pass - - #---------------------------------------------------- # SQLite File #---------------------------------------------------- @@ -104,6 +97,7 @@ class TestSQLiteFileBFS(SQLAlchemyBFS, SQLiteFile): pass + #---------------------------------------------------- # DB Backend test base #---------------------------------------------------- @@ -173,6 +167,7 @@ class TestMysqlBFS(Mysql, SQLAlchemyBFS): pass + #---------------------------------------------------- # Postgres #---------------------------------------------------- @@ -208,3 +203,4 @@ class TestPostgresDFS(Postgres, SQLAlchemyDFS): class TestPostgresBFS(Postgres, SQLAlchemyBFS): pass + From 38de8de08b66566e5b9f44aabc8103e799251b2d Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 31 May 2018 09:55:14 +0200 Subject: [PATCH 181/273] default crawling strategy --- frontera/settings/default_settings.py | 1 + frontera/worker/strategies/basic.py | 25 +++++++++++++++++++++++++ tests/backends.py | 2 +- 3 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 frontera/worker/strategies/basic.py diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 6427573e7..a877ad48c 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -67,6 +67,7 @@ STATE_CACHE_SIZE = 1000000 STATE_CACHE_SIZE_LIMIT = 0 STORE_CONTENT = False +STRATEGY = 'frontera.worker.strategies.basic.BasicCrawlingStrategy' STRATEGY_ARGS = {} SW_FLUSH_INTERVAL = 300 TEST_MODE = False diff --git a/frontera/worker/strategies/basic.py b/frontera/worker/strategies/basic.py new file mode 100644 index 000000000..84819370a --- /dev/null +++ b/frontera/worker/strategies/basic.py @@ -0,0 +1,25 @@ +from frontera.worker.strategies import BaseCrawlingStrategy +from frontera.core.components import States + + +class BasicCrawlingStrategy(BaseCrawlingStrategy): + def read_seeds(self, stream): + for url in stream: + url = url.strip() + r = self.create_request(url) + self.schedule(r) + + def filter_extracted_links(self, request, links): + return links + + def links_extracted(self, request, links): + for link in links: + if link.meta[b'state'] == States.NOT_CRAWLED: + self.schedule(link) + link.meta[b'state'] = States.QUEUED + + def page_crawled(self, response): + response.meta[b'state'] = States.CRAWLED + + def page_error(self, request, error): + request.meta[b'state'] = States.ERROR \ No newline at end of file diff --git a/tests/backends.py b/tests/backends.py index 41180778c..91530993c 100644 --- a/tests/backends.py +++ b/tests/backends.py @@ -14,7 +14,7 @@ def read_seeds(self, stream): for url in stream: url = url.strip() r = self._create_request(url) - self.schedule(r, 1.0) + self.schedule(r) def _create_request(self, url): return self.create_request(url=url, From bcc3d75d42854c34bc81ea2babe3d8127ecc6f39 Mon Sep 
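The BasicCrawlingStrategy introduced above is intentionally minimal, which also makes it a convenient base class. As an illustration only (not part of the patch), a subclass could narrow the crawl to one domain by overriding the filter_extracted_links() hook; this assumes, as the db-worker test fixtures later in the series do, that middleware fills meta[b'domain'][b'name'] with a native string:

from frontera.worker.strategies.basic import BasicCrawlingStrategy


class SingleDomainCrawlingStrategy(BasicCrawlingStrategy):
    # Hypothetical subclass for illustration; the domain name value and its
    # presence in meta are assumptions based on the test fixtures below.
    ALLOWED_DOMAIN = 'example.com'

    def filter_extracted_links(self, request, links):
        return [link for link in links
                if link.meta.get(b'domain', {}).get(b'name') == self.ALLOWED_DOMAIN]

Such a class would then be selected with the new STRATEGY setting, e.g. settings.STRATEGY = 'myproject.strategies.SingleDomainCrawlingStrategy' (path is a placeholder).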
17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 31 May 2018 09:55:30 +0200 Subject: [PATCH 182/273] scrapy mws test fix --- tests/test_scrapy.py | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/tests/test_scrapy.py b/tests/test_scrapy.py index a08d35e05..4090d716c 100644 --- a/tests/test_scrapy.py +++ b/tests/test_scrapy.py @@ -90,7 +90,7 @@ def test_request_response_converters(): class TestFronteraMiddlewaresWithScrapy(unittest.TestCase): - def setUp(self): + def init_smw(self, custom_settings): class TestSpider(Spider): name = 'test' @@ -101,10 +101,6 @@ class TestSpider(Spider): # monkey patch SPIDER_MIDDLEWARES_BASE to include only referer middleware sys.modules['scrapy.settings.default_settings'].SPIDER_MIDDLEWARES_BASE = scrapy_default_middlewares - - custom_settings = { - 'SPIDER_MIDDLEWARES': {'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000} - } crawler = get_crawler(self.spider, custom_settings) self.add_frontera_scheduler(crawler) self.smw = SpiderMiddlewareManager.from_crawler(crawler) @@ -125,8 +121,7 @@ def __init__(self, scheduler): crawler.engine = Engine(scheduler) - def test_frontera_scheduler_spider_middleware_with_referer_middleware(self): - + def perform_test(self, output_func): def request_callback(response): yield Request('http://frontera.org') @@ -143,20 +138,37 @@ def call_request_callback(result, request, spider): dfd.addCallback(request.callback) return dfd - def test_middleware_output(result): - out = list(result) - self.assertEquals(len(out), 1) - self.assertIsInstance(out[0], Request) - self.assertIn('Referer', out[0].headers) - self.assertEquals(out[0].headers['Referer'], to_bytes(res.url)) - def test_failure(failure): # work around for test to fail with detailed traceback self._observer._errors.append(failure) dfd = self.smw.scrape_response(call_request_callback, res, req, self.spider) - dfd.addCallback(test_middleware_output) + dfd.addCallback(output_func) dfd.addErrback(test_failure) dfd.callback(res) + + def test_frontera_scheduler_spider_mw_with_referer_mw(self): + + def test_middleware_output(result): + out = list(result) + # Frontera swallows requests but passes items + self.assertEquals(len(out), 0) + + self.init_smw({ + 'SPIDER_MIDDLEWARES': {'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000} + }) + self.perform_test(test_middleware_output) + + def test_frontera_scheduler_spider_mw_without_referer_mw(self): + + def test_middleware_output(result): + out = list(result) + self.assertEquals(len(out), 1) + self.assertIsInstance(out[0], Request) + self.assertIn('Referer', out[0].headers) + self.assertEquals(out[0].headers['Referer'], to_bytes('http://www.scrapy.org')) + + self.init_smw({}) + self.perform_test(test_middleware_output) From ac0da26b53f8a21e2e453a643220f248cb0c21cf Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 31 May 2018 11:01:26 +0200 Subject: [PATCH 183/273] dbw test cases fix --- tests/mocks/components.py | 6 +++--- tests/test_worker_db.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/mocks/components.py b/tests/mocks/components.py index aa6c76b69..d073f0e71 100644 --- a/tests/mocks/components.py +++ b/tests/mocks/components.py @@ -68,9 +68,9 @@ def count(self): return len(self.requests) def schedule(self, batch): - for obj in batch: - if obj[3]: - self.requests.append(Request(obj[2].url, meta={b'fingerprint': obj[0], b'score': obj[1]})) + for fingerprint, 
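Splitting the old test in two pins down the intended behaviour: with Frontera's scheduler spider middleware enabled, requests produced by spider callbacks are swallowed (routed to the frontier) instead of travelling further down the middleware chain. The only configuration difference between the two cases is the setting already shown in the test above:

# Enabling the scheduler spider middleware (first test): callback-produced
# Requests never reach RefererMiddleware. Omitting it (second test) restores
# stock Scrapy behaviour. Priority 1000 is the value used in the test.
SPIDER_MIDDLEWARES = {
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
}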
score, request, is_schedule in batch: + if is_schedule: + self.requests.append(request) class FakeBackend(FakeMiddleware, Backend): diff --git a/tests/test_worker_db.py b/tests/test_worker_db.py index 47fc6f3ec..8750b58d9 100644 --- a/tests/test_worker_db.py +++ b/tests/test_worker_db.py @@ -5,9 +5,9 @@ import unittest -r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'state': States.DEFAULT, b'jid': 0}) -r2 = Request('http://www.scrapy.org/', meta={b'fingerprint': b'2', b'state': States.DEFAULT, b'jid': 0}) -r3 = Request('https://www.dmoz.org', meta={b'fingerprint': b'3', b'state': States.DEFAULT, b'jid': 0}) +r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'state': States.DEFAULT, b'jid': 0, b'domain':{b'name':'www.example.com'}}) +r2 = Request('http://www.scrapy.org/', meta={b'fingerprint': b'2', b'state': States.DEFAULT, b'jid': 0, b'domain':{b'name':'www.scrapy.org'}}) +r3 = Request('https://www.dmoz.org', meta={b'fingerprint': b'3', b'state': States.DEFAULT, b'jid': 0, b'domain':{b'name':'www.dmoz.org'}}) class TestDBWorker(unittest.TestCase): @@ -20,7 +20,7 @@ def dbw_setup(self, distributed=False): settings.BACKEND = 'tests.mocks.components.FakeDistributedBackend' else: settings.BACKEND = 'tests.mocks.components.FakeBackend' - return DBWorker(settings, False, False, False, partitions="0") + return DBWorker(settings, False, False, False, partitions=[0,1,2,3]) def test_page_crawled(self): dbw = self.dbw_setup() From 0ac5f9ce61e2ca796f4a3f6a3eb66f785b955ff3 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 31 May 2018 11:34:14 +0200 Subject: [PATCH 184/273] memory backends test fixes --- tests/backends.py | 21 ++++++++++++++++++--- tests/test_overused_buffer.py | 9 +++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/tests/backends.py b/tests/backends.py index 91530993c..386f15b67 100644 --- a/tests/backends.py +++ b/tests/backends.py @@ -10,6 +10,10 @@ class BasicCrawlingStrategy(BaseCrawlingStrategy): + def __init__(self, manager, args, scheduled_stream, states_context): + super(BasicCrawlingStrategy, self).__init__(manager, args, scheduled_stream, states_context) + self._id = 0 + def read_seeds(self, stream): for url in stream: url = url.strip() @@ -17,13 +21,16 @@ def read_seeds(self, stream): self.schedule(r) def _create_request(self, url): - return self.create_request(url=url, + r = self.create_request(url=url, headers={ b'X-Important-Header': b'Frontera' }, method=b'POST', cookies={b'currency': b'USD'}, - meta={b'this_param': b'should be passed over'}) + meta={b'this_param': b'should be passed over', + b'id': self._id}) + self._id += 1 + return r def filter_extracted_links(self, request, links): return links @@ -505,7 +512,7 @@ def test_sequence(self, site_list, max_next_requests, expected_sequence): ) def get_settings(self): - settings = super(BackendSequenceTest, self).get_settings() + settings = super(DFSBackendTest, self).get_settings() settings.TEST_MODE = True settings.LOGGING_MANAGER_ENABLED = False settings.LOGGING_BACKEND_ENABLED = False @@ -569,6 +576,14 @@ def test_sequence(self, site_list, max_next_requests, expected_sequence): expected_sequence=self.EXPECTED_SEQUENCES[expected_sequence], max_next_requests=max_next_requests, ) + def get_settings(self): + settings = super(BFSBackendTest, self).get_settings() + settings.TEST_MODE = True + settings.LOGGING_MANAGER_ENABLED = False + settings.LOGGING_BACKEND_ENABLED = False + settings.LOGGING_DEBUGGING_ENABLED = False + settings.STRATEGY = 
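Unpacking the tuple by name also documents the scheduling batch format that both the mock and the real queues consume. A sketch of one batch entry, with illustrative values:

from frontera.core.models import Request

r = Request('http://www.example.com/',
            meta={b'fingerprint': b'1',
                  b'domain': {b'name': 'www.example.com'}})
batch = [
    # (fingerprint, score, request, schedule_flag); only entries whose
    # schedule_flag is True are actually enqueued, as the mock above shows
    (b'1', 0.5, r, True),
]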
'tests.backends.BFSCrawlingStrategy' + return settings class RANDOMBackendTest(BackendSequenceTest): diff --git a/tests/test_overused_buffer.py b/tests/test_overused_buffer.py index 96524a4c5..4213fe6b2 100644 --- a/tests/test_overused_buffer.py +++ b/tests/test_overused_buffer.py @@ -22,6 +22,15 @@ class DFSOverusedBackendTest(BackendSequenceTest): ] } + def get_settings(self): + settings = super(DFSOverusedBackendTest, self).get_settings() + settings.TEST_MODE = True + settings.LOGGING_MANAGER_ENABLED = False + settings.LOGGING_BACKEND_ENABLED = False + settings.LOGGING_DEBUGGING_ENABLED = False + settings.STRATEGY = 'tests.backends.DFSCrawlingStrategy' + return settings + def test_sequence1(self): sequence = self.get_sequence(TEST_SITES['SITE_09'], max_next_requests=5, downloader_simulator=DownloaderSimulator(rate=1)) From d9b1d648430fd1b79aba82e1dba112f58e90c7e8 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 31 May 2018 11:53:15 +0200 Subject: [PATCH 185/273] Redis test case fix --- tests/contrib/backends/redis_backend/test_redis.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py index 95bdbc73b..18fb897af 100644 --- a/tests/contrib/backends/redis_backend/test_redis.py +++ b/tests/contrib/backends/redis_backend/test_redis.py @@ -3,7 +3,7 @@ from frontera.contrib.backends.redis_backend import FIELD_DOMAIN_FINGERPRINT, FIELD_ERROR, FIELD_STATE from frontera.contrib.backends.redis_backend import FIELD_STATUS_CODE, FIELD_URL from frontera.contrib.backends.redis_backend import RedisBackend, RedisMetadata, RedisQueue, RedisState -from frontera.core.manager import BaseContext +from frontera.core.manager import WorkerFrontierManager from frontera.settings import Settings from redis import ConnectionPool, StrictRedis from time import time @@ -39,7 +39,7 @@ class RedisQueueTest(TestCase): @staticmethod def setup_subject(partitions): settings = Settings(module='frontera.settings.default_settings') - return RedisQueue(BaseContext.from_settings(settings), get_pool(), partitions, True) + return RedisQueue(WorkerFrontierManager.from_settings(settings), get_pool(), partitions, True) def test_scheduling_past_1part_5(self): subject = self.setup_subject(1) @@ -483,7 +483,7 @@ def setup_subject(partitions): settings = Settings(module='frontera.settings.default_settings') settings.set('SPIDER_FEED_PARTITIONS', partitions) settings.set('REDIS_DROP_ALL_TABLES', True) - return RedisBackend.db_worker(FrontierManager.from_settings(settings)) + return RedisBackend.db_worker(WorkerFrontierManager.from_settings(settings, db_worker=True)) def test_get_next_request(self): subject = self.setup_subject(2) From 1de3c4b69d41a2d3d4700f850e30708530cd6d53 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 31 May 2018 12:05:20 +0200 Subject: [PATCH 186/273] syntax --- frontera/contrib/backends/hbase/__init__.py | 34 +++++++++---------- .../contrib/backends/hbase/domaincache.py | 20 ++++++----- frontera/contrib/backends/memory/__init__.py | 1 - frontera/contrib/messagebus/kafka/async.py | 1 + frontera/core/manager.py | 28 +++++++-------- frontera/core/models.py | 2 +- frontera/worker/components/batch_generator.py | 3 +- .../worker/components/incoming_consumer.py | 2 +- .../worker/components/scoring_consumer.py | 2 +- frontera/worker/strategy.py | 5 +-- 10 files changed, 50 insertions(+), 48 deletions(-) diff --git a/frontera/contrib/backends/hbase/__init__.py 
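The Redis fix reflects the new entry point for worker-side components: managers are now built through WorkerFrontierManager.from_settings(), with flags selecting the component pipeline. A minimal sketch mirroring the fixed tests:

from frontera.core.manager import WorkerFrontierManager
from frontera.settings import Settings

# db_worker=True selects the db-worker pipeline, as in the fixed test above
settings = Settings(module='frontera.settings.default_settings')
manager = WorkerFrontierManager.from_settings(settings, db_worker=True)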
b/frontera/contrib/backends/hbase/__init__.py index 43fa5fdbc..ac10eb12b 100644 --- a/frontera/contrib/backends/hbase/__init__.py +++ b/frontera/contrib/backends/hbase/__init__.py @@ -24,7 +24,6 @@ from collections import defaultdict, Iterable import logging - _pack_functions = { 'url': to_bytes, 'depth': lambda x: pack('>I', 0), @@ -83,7 +82,6 @@ def popitem(self): class HBaseQueue(Queue): - GET_RETRIES = 3 def __init__(self, connection, partitions, table_name, drop=False, use_snappy=False): @@ -106,6 +104,7 @@ def __init__(self, connection, partitions, table_name, drop=False, use_snappy=Fa class DumbResponse: pass + self.decoder = Decoder(Request, DumbResponse) self.encoder = Encoder(Request) @@ -145,6 +144,7 @@ def _schedule(self, batch, timestamp): :param batch: iterable of Request objects :return: """ + def get_interval(score, resolution): if score < 0.0 or score > 1.0: raise OverflowError @@ -209,7 +209,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): min_requests = kwargs.pop('min_requests') min_hosts = kwargs.pop('min_hosts', None) max_requests_per_host = kwargs.pop('max_requests_per_host', None) - assert(max_n_requests > min_requests) + assert (max_n_requests > min_requests) table = self.connection.table(self.table_name) meta_map = {} @@ -218,7 +218,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): tries = 0 count = 0 prefix = to_bytes('%d_' % partition_id) - now_ts = int(time()) + # now_ts = int(time()) # TODO: figure out how to use filter here, Thrift filter above causes full scan # filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) while tries < self.GET_RETRIES: @@ -299,7 +299,6 @@ def count(self): class HBaseState(States): - def __init__(self, connection, table_name, cache_size_limit, write_log_size, drop_all_tables): self.connection = connection @@ -344,7 +343,7 @@ def flush(self): def fetch(self, fingerprints): to_fetch = [f for f in fingerprints if f not in self._state_cache] - self._update_cache_stats(hits=len(fingerprints)-len(to_fetch), + self._update_cache_stats(hits=len(fingerprints) - len(to_fetch), misses=len(to_fetch)) if not to_fetch: return @@ -425,7 +424,8 @@ def page_crawled(self, response): created_at=utcnow_timestamp(), dest_fprint=redirect_fprints[-1]) self.batch.put(fprint, obj) - obj = prepare_hbase_object(status_code=response.status_code, headers=headers, content=response.body) if self.store_content else \ + obj = prepare_hbase_object(status_code=response.status_code, headers=headers, + content=response.body) if self.store_content else \ prepare_hbase_object(status_code=response.status_code, headers=headers) self.batch.put(unhexlify(response.meta[b'fingerprint']), obj) @@ -498,15 +498,15 @@ def __init__(self, manager): def _init_states(self, settings): self._states = HBaseState(connection=self.connection, - table_name=settings.get('HBASE_STATES_TABLE'), - cache_size_limit=settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'), - write_log_size=settings.get('HBASE_STATE_WRITE_LOG_SIZE'), - drop_all_tables=settings.get('HBASE_DROP_ALL_TABLES')) + table_name=settings.get('HBASE_STATES_TABLE'), + cache_size_limit=settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'), + write_log_size=settings.get('HBASE_STATE_WRITE_LOG_SIZE'), + drop_all_tables=settings.get('HBASE_DROP_ALL_TABLES')) def _init_queue(self, settings): self._queue = HBaseQueue(self.connection, self.queue_partitions, - settings.get('HBASE_QUEUE_TABLE'), drop=settings.get('HBASE_DROP_ALL_TABLES'), - 
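Most of this patch is stylistic, but the hunks above surface the HBaseQueue.get_next_requests() contract. A call honouring it looks like this; `queue` is assumed to be an HBaseQueue instance and the concrete numbers are illustrative:

# Per the assert in the hunk above, max_n_requests must be strictly greater
# than min_requests; min_hosts and max_requests_per_host are optional.
requests = queue.get_next_requests(
    256,                     # max_n_requests
    0,                       # partition_id
    min_requests=64,
    min_hosts=24,
    max_requests_per_host=5,
)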
use_snappy=settings.get('HBASE_USE_SNAPPY')) + settings.get('HBASE_QUEUE_TABLE'), drop=settings.get('HBASE_DROP_ALL_TABLES'), + use_snappy=settings.get('HBASE_USE_SNAPPY')) def _init_metadata(self, settings): self._metadata = HBaseMetadata(self.connection, settings.get('HBASE_METADATA_TABLE'), @@ -578,10 +578,10 @@ def get_next_requests(self, max_next_requests, **kwargs): results = [] for partition_id in set(kwargs.pop('partitions', [i for i in range(self.queue_partitions)])): requests = self.queue.get_next_requests( - max_next_requests, partition_id, - min_requests=self._min_requests, - min_hosts=self._min_hosts, - max_requests_per_host=self._max_requests_per_host) + max_next_requests, partition_id, + min_requests=self._min_requests, + min_hosts=self._min_hosts, + max_requests_per_host=self._max_requests_per_host) results.extend(requests) self.logger.debug("Got %d requests for partition id %d", len(requests), partition_id) return results diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py index 088353929..e8eca46d0 100644 --- a/frontera/contrib/backends/hbase/domaincache.py +++ b/frontera/contrib/backends/hbase/domaincache.py @@ -5,17 +5,19 @@ from time import time import six -from frontera.contrib.backends.hbase.utils import HardenedBatch -from frontera.utils.msgpack import restruct_for_pack from msgpack import packb, unpackb from w3lib.util import to_bytes, to_native_str -DEFAULT_HBASE_THRIFT_FRAME_SIZE = 2097152 +from frontera.contrib.backends.hbase.utils import HardenedBatch +from frontera.utils.msgpack import restruct_for_pack import collections from cachetools import Cache +DEFAULT_HBASE_THRIFT_FRAME_SIZE = 2097152 + + class LRUCache(Cache): """Least Recently Used (LRU) cache implementation.""" @@ -109,10 +111,10 @@ def __getitem__(self, key): self._key_check(key) try: value = Cache.__getitem__(self, key) - except KeyError as ke1: + except KeyError: try: value = self._second_gen[key] - except KeyError as ke2: + except KeyError: try: value = self._get_item(key) except KeyError as ke3: @@ -168,7 +170,7 @@ def popitem(self): if len(self._second_gen) >= self.batch_size: self._flush_second_gen() self._second_gen.clear() - self.stats["flushes"]+=1 + self.stats["flushes"] += 1 # These methods aren't meant to be implemented @@ -206,7 +208,7 @@ def setdefault(self, key, default=None): HBase-optimized setdefault """ self._key_check(key) - self.stats["gets"]+=1 + self.stats["gets"] += 1 self._log_and_rotate_stats() if super(DomainCache, self).__contains__(key) or key in self._second_gen: value = self[key] @@ -256,7 +258,7 @@ def _get_domain_table(self, connection, table_name): return connection.table(table_name) def _get_item(self, key): - self.stats["hbase_gets"]+=1 + self.stats["hbase_gets"] += 1 hbase_key = to_bytes(key) row = self._table.row(hbase_key) if not row: @@ -293,7 +295,7 @@ def _store_item_batch(self, key, value): self.logger.exception("Exception happened during item storing, %d tries left", tries) data_lengths = dict((k, len(v)) for k, v in six.iteritems(data)) self.logger.info("RK %s per-column lengths %s", key, str(data_lengths)) - for k ,length in data_lengths.items(): + for k, length in data_lengths.items(): if length > self.MAX_VALUE_SIZE: self.logger.info("Dropping key %s", k) del data[k] diff --git a/frontera/contrib/backends/memory/__init__.py b/frontera/contrib/backends/memory/__init__.py index 2ca2dc261..d3db9a75c 100644 --- a/frontera/contrib/backends/memory/__init__.py +++ 
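For orientation: the DomainCache touched here pairs an in-memory LRU with a write-behind "second generation" that is flushed in batches once it reaches batch_size (see the popitem() hunk above). A generic, self-contained sketch of that pattern, not the HBase-backed implementation:

from collections import OrderedDict


class TwoGenCache(object):
    """Illustration of the two-generation scheme used by DomainCache."""

    def __init__(self, maxsize, batch_size, flush):
        self._gen1 = OrderedDict()  # hot entries, LRU-ordered
        self._gen2 = {}             # evicted entries awaiting a batch flush
        self.maxsize = maxsize
        self.batch_size = batch_size
        self.flush = flush          # callable persisting a dict of items

    def __setitem__(self, key, value):
        self._gen1[key] = value
        self._gen1.move_to_end(key)                # mark as most recently used
        if len(self._gen1) > self.maxsize:
            k, v = self._gen1.popitem(last=False)  # evict the LRU entry
            self._gen2[k] = v
            if len(self._gen2) >= self.batch_size:  # mirrors DomainCache.popitem()
                self.flush(dict(self._gen2))
                self._gen2.clear()

    def __getitem__(self, key):
        if key in self._gen1:
            self._gen1.move_to_end(key)
            return self._gen1[key]
        return self._gen2[key]  # the real class falls back to HBase here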
b/frontera/contrib/backends/memory/__init__.py @@ -310,7 +310,6 @@ def db_worker(cls, manager): return cls(manager) - BASE = MemoryBaseBackend FIFO = MemoryFIFOBackend LIFO = MemoryLIFOBackend diff --git a/frontera/contrib/messagebus/kafka/async.py b/frontera/contrib/messagebus/kafka/async.py index 158e3a0e9..97315341a 100644 --- a/frontera/contrib/messagebus/kafka/async.py +++ b/frontera/contrib/messagebus/kafka/async.py @@ -223,6 +223,7 @@ def _send_offset_request(self, partitions, timestamp): future_request = Future() _f = self._client.send(node_id, request) _f.add_callback(self._handle_offset_response, partitions, future_request) + def errback(e): log.error("Offset request errback error %s", e) future_request.failure(e) diff --git a/frontera/core/manager.py b/frontera/core/manager.py index c005d56ff..1fc7405b2 100644 --- a/frontera/core/manager.py +++ b/frontera/core/manager.py @@ -172,7 +172,7 @@ def __init__(self, request_model, response_model, settings=None): self._logger = logging.getLogger("manager") # Log frontier manager starting - self._logger.info('-'*80) + self._logger.info('-' * 80) self._logger.info('Starting Frontier Manager...') # Load request model @@ -294,9 +294,9 @@ def links_extracted_after(self, request, filtered): def request_error(self, request, error): self._logger.debug('PAGE_REQUEST_ERROR url=%s error=%s', request.url, error) return self._process_components(method_name='request_error', - obj=request, - return_classes=self.request_model, - error=error) + obj=request, + return_classes=self.request_model, + error=error) class LocalFrontierManager(BaseContext, StrategyComponentsPipelineMixin, BaseManager): @@ -305,6 +305,7 @@ class LocalFrontierManager(BaseContext, StrategyComponentsPipelineMixin, BaseMan providing an API to interact with. It's also responsible of loading and communicating all different frontier components. """ + def __init__(self, request_model, response_model, backend, strategy_class, strategy_args, middlewares=None, test_mode=False, max_requests=0, max_next_requests=0, auto_start=True, settings=None, canonicalsolver=None): @@ -370,7 +371,7 @@ def __init__(self, request_model, response_model, backend, strategy_class, strat # Log frontier manager start self._logger.info('Frontier Manager Started!') - self._logger.info('-'*80) + self._logger.info('-' * 80) # start/stop self._started = False @@ -519,7 +520,7 @@ def get_next_requests(self, max_next_requests=0, **kwargs): if not max_next_requests: max_next_requests = self.max_requests - self.n_requests else: - if self.n_requests+max_next_requests > self.max_requests: + if self.n_requests + max_next_requests > self.max_requests: max_next_requests = self.max_requests - self.n_requests # get next requests @@ -601,7 +602,7 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No self._process_components('create_request', obj=r, return_classes=self.request_model, - components=(0,1)) + components=(0, 1)) return r def _check_startstop(self): @@ -614,6 +615,7 @@ class WorkerFrontierManager(BaseContext, StrategyComponentsPipelineMixin): The :class:`WorkerFrontierManager ` class role is to instantiate the core components and is used mainly by workers. 
""" + def __init__(self, settings, request_model, response_model, backend, max_next_requests, strategy_class=None, strategy_args=None, scoring_stream=None, middlewares=None, canonicalsolver=None, db_worker=False, strategy_worker=False): @@ -652,7 +654,7 @@ def __init__(self, settings, request_model, response_model, backend, max_next_re if strategy_worker: StrategyComponentsPipelineMixin.__init__(self, backend, strategy_class, strategy_args, scoring_stream, middlewares=middlewares, canonicalsolver=canonicalsolver, - db_worker=db_worker,strategy_worker=strategy_worker) + db_worker=db_worker, strategy_worker=strategy_worker) # Init frontier components pipeline # Some code relies on the order, modify carefully self._components_pipeline = [ @@ -660,11 +662,11 @@ def __init__(self, settings, request_model, response_model, backend, max_next_re ('CanonicalSolver', self.canonicalsolver, False), ] if db_worker: - ComponentsPipelineMixin.__init__(self, backend, db_worker=db_worker,strategy_worker=strategy_worker) + ComponentsPipelineMixin.__init__(self, backend, db_worker=db_worker, strategy_worker=strategy_worker) # Log frontier manager start self._logger.info('Frontier Manager Started!') - self._logger.info('-'*80) + self._logger.info('-' * 80) @classmethod def from_settings(cls, settings=None, db_worker=False, strategy_worker=False, scoring_stream=None): @@ -672,7 +674,7 @@ def from_settings(cls, settings=None, db_worker=False, strategy_worker=False, sc kwargs = { 'request_model': manager_settings.REQUEST_MODEL, 'response_model': manager_settings.RESPONSE_MODEL, - 'backend' : manager_settings.BACKEND, + 'backend': manager_settings.BACKEND, 'max_next_requests': manager_settings.MAX_NEXT_REQUESTS, 'settings': manager_settings, 'db_worker': db_worker, @@ -738,7 +740,6 @@ def links_extracted(self, request, links): @six.add_metaclass(ABCMeta) class UpdateScoreStream(object): - @abstractmethod def send(self, request, score=1.0, dont_queue=False): pass @@ -770,7 +771,6 @@ def send(self, request, score=1.0, dont_queue=False): class StatesContext(object): - def __init__(self, states): self._requests = [] self.states = states @@ -800,4 +800,4 @@ def release(self): def flush(self): self.logger.info("Flushing states") self.states.flush() - self.logger.info("Flushing of states finished") \ No newline at end of file + self.logger.info("Flushing of states finished") diff --git a/frontera/core/models.py b/frontera/core/models.py index 480939a4b..0b1c37423 100644 --- a/frontera/core/models.py +++ b/frontera/core/models.py @@ -16,6 +16,7 @@ class Request(FrontierObject): :class:`Response ` object when crawled. """ + def __init__(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=''): """ :param string url: URL to send. 
@@ -86,7 +87,6 @@ def __str__(self): def __hash__(self): return hash(self.meta[b'fingerprint']) - __repr__ = __str__ diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 94763797d..78f97747a 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -85,7 +85,7 @@ def _handle_partition(self, partition_id): self.spider_feed_producer.send(self.get_key_function(request), eo) except Exception: self.logger.exception("Sending message error fingerprint: %s, url: %s" % - (self.get_fingerprint(request), request.url)) + (self.get_fingerprint(request), request.url)) finally: count += 1 hostname = self.get_hostname(request) @@ -121,7 +121,6 @@ def rotate_and_log_domain_stats(self): self.domain_stats[partition_id] = defaultdict(int) self.rotate_time = time() + self.domain_stats_interval - # --------------------------- Auxiliary tools -------------------------------- def get_fingerprint(self, request): diff --git a/frontera/worker/components/incoming_consumer.py b/frontera/worker/components/incoming_consumer.py index 98ad19527..43b5f83f9 100644 --- a/frontera/worker/components/incoming_consumer.py +++ b/frontera/worker/components/incoming_consumer.py @@ -59,7 +59,7 @@ def _handle_message(self, msg, stats): """Base logic to safely handle a message.""" try: self._handle_message_by_type(msg[0], msg, stats) - except Exception as exc: + except Exception: self.logger.exception("Error while handling a message") self.logger.debug("Message caused the error %s", str(msg)) diff --git a/frontera/worker/components/scoring_consumer.py b/frontera/worker/components/scoring_consumer.py index eb40e7713..0fdd85244 100644 --- a/frontera/worker/components/scoring_consumer.py +++ b/frontera/worker/components/scoring_consumer.py @@ -31,7 +31,7 @@ def run(self): count=self.scoring_log_consumer_batch_size): try: msg = self.worker._decoder.decode(m) - except (KeyError, TypeError) as e: + except (KeyError, TypeError): self.logger.exception("Decoding error") continue else: diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 96968f91c..e9e3671d0 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -181,7 +181,7 @@ def work(self): for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0): try: event = self._decoder.decode(m) - except (KeyError, TypeError) as e: + except (KeyError, TypeError): logger.exception("Decoding error") logger.debug("Message %s", hexlify(m)) continue @@ -242,7 +242,7 @@ def errback_main(failure): def run_flush_states_task(): (self._flush_states_task.start(interval=self._flush_interval) - .addErrback(errback_flush_states)) + .addErrback(errback_flush_states)) def errback_flush_states(failure): log_failure(failure) @@ -326,6 +326,7 @@ class StrategyWorker(StatsExportMixin, BaseStrategyWorker): The additional features are provided by using mixin classes: - sending crawl stats to message bus """ + def get_stats_tags(self, settings, *args, **kwargs): return {'source': 'sw', 'partition_id': settings.get('SCORING_PARTITION_ID')} From 90484e44f4dcdd12c4d111ad634fda51cb63fcef Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 31 May 2018 12:48:10 +0200 Subject: [PATCH 187/273] more work on style --- frontera/worker/components/__init__.py | 2 +- frontera/worker/strategy.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/frontera/worker/components/__init__.py 
b/frontera/worker/components/__init__.py index c33129150..f52db7bf7 100644 --- a/frontera/worker/components/__init__.py +++ b/frontera/worker/components/__init__.py @@ -79,7 +79,7 @@ def loop(self): while not self.stop_event.is_set(): try: is_backoff_needed = self.run() - except Exception as exc: + except Exception: self.logger.exception('Exception in the main loop') else: if is_backoff_needed and self.run_backoff: diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index e9e3671d0..8e581b4db 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -70,7 +70,7 @@ def process(self): self.stats['consumed_request_error'] += 1 continue self.on_unknown_event(event) - except: + except Exception: logger.exception("Exception during processing") pass self.scoring_stream.flush() @@ -104,7 +104,7 @@ def collect(self, event): if typ == 'offset': return self.collect_unknown_event(event) - except: + except Exception: logger.exception("Error during event collection") pass @@ -224,7 +224,7 @@ def add_seeds(self, seeds_url): strategy.read_seeds(fh) try: fh.close() - except: + except Exception: logger.exception("Error during closing of seeds stream") pass self.update_score.flush() @@ -313,7 +313,7 @@ def _perform_shutdown(self, _=None): self.scoring_log_producer.close() if not self.add_seeds_mode: self.consumer.close() - except: + except Exception: logger.exception('Error on shutdown') def set_process_info(self, process_info): From 986fad4de21cdd6022afb672e0313989768cd4e1 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 31 May 2018 15:01:30 +0200 Subject: [PATCH 188/273] fix of add seeds utility --- frontera/utils/add_seeds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/utils/add_seeds.py b/frontera/utils/add_seeds.py index 9fff305cf..d3952dd6d 100644 --- a/frontera/utils/add_seeds.py +++ b/frontera/utils/add_seeds.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from frontera.core.manager import FrontierManager +from frontera.core.manager import LocalFrontierManager from frontera.settings import Settings from frontera.logger.handlers import CONSOLE from argparse import ArgumentParser @@ -30,7 +30,7 @@ logger.info("Starting local seeds addition from file %s", args.seeds_file) -manager = FrontierManager.from_settings(settings) +manager = LocalFrontierManager.from_settings(settings) manager.add_seeds(fh) manager.stop() manager.close() From 2e042544b00eb972d6ad1a347715d358d8cdfd93 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 1 Jun 2018 19:47:14 +0200 Subject: [PATCH 189/273] fixing distributed run mode --- docs/source/index.rst | 13 ++- docs/source/topics/frontera-settings.rst | 86 ++++++++++++------- .../contrib/backends/remote/messagebus.py | 1 + frontera/contrib/messagebus/kafkabus.py | 2 +- .../contrib/scrapy/schedulers/frontier.py | 2 +- frontera/core/manager.py | 19 ++++ frontera/settings/default_settings.py | 1 + frontera/utils/managers.py | 5 +- frontera/worker/strategy.py | 3 + 9 files changed, 87 insertions(+), 45 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index d36ab0905..d2fb91520 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,16 +4,13 @@ Frontera |version| documentation ================================ -`Frontera`_ is a web crawling tool box, allowing to build crawlers of any scale and purpose. - -`Frontera`_ provides :ref:`crawl frontier ` framework by managing *when* and *what* to crawl next, -and checking for *crawling goal* accomplishment. 
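With the import corrected, the seeds utility shown above works again. An equivalent in-process sketch, where the settings module name and the seeds file are placeholders:

from frontera.core.manager import LocalFrontierManager
from frontera.settings import Settings

settings = Settings(module='myproject.frontera_settings')  # placeholder module
manager = LocalFrontierManager.from_settings(settings)
with open('seeds.txt', 'rb') as fh:  # one URL per line, as read_seeds() expects
    manager.add_seeds(fh)
manager.stop()
manager.close()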
-
-Frontera also provides replication, sharding and isolation of all crawler components to scale and distribute it.
+`Frontera`_ is a web crawling tool box that allows building crawlers of any scale and purpose. It includes:
+* :ref:`crawl frontier ` framework managing *when* and *what* to crawl and checking for
+*crawling goal* accomplishment,
+* workers, Scrapy wrappers, and data bus components to scale and distribute the crawler.
 
 Frontera contain components to allow creation of fully-operational web crawler with `Scrapy`_. Even though it was
-originally designed for Scrapy, it can also be used with any other crawling framework/system as the framework offers
-a generic tool box.
+originally designed for Scrapy, it can also be used with any other crawling framework/system.
 
 
 Introduction
diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst
index e3e3e48dc..0bd8c3a03 100644
--- a/docs/source/topics/frontera-settings.rst
+++ b/docs/source/topics/frontera-settings.rst
@@ -128,39 +128,6 @@ Default: ``frontera.contrib.canonicalsolvers.Basic``
 The :class:`CanonicalSolver ` to be used by the frontier for resolving canonical URLs.
 For more info see :ref:`Canonical URL Solver `.
 
-.. setting:: SPIDER_LOG_CONSUMER_BATCH_SIZE
-
-SPIDER_LOG_CONSUMER_BATCH_SIZE
-------------------------------
-
-Default: ``512``
-
-This is a batch size used by strategy and db workers for consuming of spider log stream. Increasing it
-will cause worker to spend more time on every task, but processing more items per task, therefore leaving less time for
-other tasks during some fixed time interval. Reducing it will result to running several tasks within the same time
-interval, but with less overall efficiency. Use it when your consumers too slow, or too fast.
-
-.. setting:: SCORING_LOG_CONSUMER_BATCH_SIZE
-
-SCORING_LOG_CONSUMER_BATCH_SIZE
--------------------------------
-
-Default: ``512``
-
-This is a batch size used by db worker for consuming of scoring log stream. Use it when you need to adjust scoring log
-consumption speed.
-
-
-.. setting:: CRAWLING_STRATEGY
-
-CRAWLING_STRATEGY
------------------
-
-Default: ``None``
-
-The path to crawling strategy class, instantiated and used in :term:`strategy worker` to prioritize and stop crawling in
-distributed run mode.
-
 .. setting:: DELAY_ON_EMPTY
 
 DELAY_ON_EMPTY
@@ -194,6 +161,17 @@ Default: ``5.0``
 Time process should block until requested amount of data will be received from message bus. This is a general message bus
 setting with obsolete Kafka-related name.
 
+
+.. setting:: LOCAL_MODE
+
+LOCAL_MODE
+----------
+
+Default: ``True``
+
+Sets single-process run mode: the crawling strategy and the backend are used from the same spider process.
+
+
 .. setting:: LOGGING_CONFIG
 
 LOGGING_CONFIG
@@ -342,6 +320,29 @@ Default: ``'frontera.core.models.Response'``
 The :class:`Response ` model to be used by the frontier.
 
+.. setting:: SPIDER_LOG_CONSUMER_BATCH_SIZE
+
+SPIDER_LOG_CONSUMER_BATCH_SIZE
+------------------------------
+
+Default: ``512``
+
+This is a batch size used by strategy and db workers for consuming the spider log stream. Increasing it
+will cause the worker to spend more time on every task, processing more items per task and therefore leaving less
+time for other tasks during a fixed time interval. Reducing it will result in running several tasks within the same
+time interval, but with less overall efficiency. Tune it when your consumers are too slow or too fast.
+
+.. setting:: SCORING_LOG_CONSUMER_BATCH_SIZE
+
+SCORING_LOG_CONSUMER_BATCH_SIZE
+-------------------------------
+
+Default: ``512``
+
+This is a batch size used by the db worker for consuming the scoring log stream. Use it when you need to adjust
+scoring log consumption speed.
+
+
 .. setting:: SCORING_PARTITION_ID
 
 SCORING_PARTITION_ID
@@ -400,6 +401,25 @@ Default: ``False``
 Determines if content should be sent over the message bus and stored in the backend: a serious performance killer.
 
+.. setting:: STRATEGY
+
+STRATEGY
+--------
+
+Default: ``frontera.worker.strategies.basic.BasicCrawlingStrategy``
+
+The path to the crawling strategy class.
+
+.. setting:: STRATEGY_ARGS
+
+STRATEGY_ARGS
+-------------
+
+Default: ``{}``
+
+Dict with default arguments for the crawling strategy. Can be overridden with a command-line option in
+:term:`strategy worker`.
+
 .. setting:: SW_FLUSH_INTERVAL
 
 SW_FLUSH_INTERVAL
diff --git a/frontera/contrib/backends/remote/messagebus.py b/frontera/contrib/backends/remote/messagebus.py
index bfc1bc82d..4d5b564b1 100644
--- a/frontera/contrib/backends/remote/messagebus.py
+++ b/frontera/contrib/backends/remote/messagebus.py
@@ -42,6 +42,7 @@ def frontier_start(self):
 
     def frontier_stop(self):
         self.spider_log_producer.flush()
+        self.consumer.close()
 
     def add_seeds(self, seeds):
         raise NotImplemented("The seeds addition using spider log isn't allowed")
diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py
index 9add0d943..bae555a6d 100644
--- a/frontera/contrib/messagebus/kafkabus.py
+++ b/frontera/contrib/messagebus/kafkabus.py
@@ -224,7 +224,7 @@ def producer(self):
                          buffer_memory=DEFAULT_BUFFER_MEMORY)
 
 
-class StatsLogStream(BaseStatsLogStream, ScoringLogStream):
+class StatsLogStream(ScoringLogStream, BaseStatsLogStream):
     """Stats log stream implementation for Kafka message bus.
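# Why the base-class swap above matters: Python resolves attributes left to
# right along the MRO, so the concrete ScoringLogStream must come before the
# abstract BaseStatsLogStream. A minimal, self-contained illustration:
class Base(object):
    def producer(self):
        raise NotImplementedError


class Concrete(Base):
    def producer(self):
        return 'kafka-producer'


class Stats(Concrete, Base):  # concrete implementation listed first, as above
    pass


assert Stats().producer() == 'kafka-producer'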
The interface is the same as for scoring log stream, so it's better diff --git a/frontera/contrib/scrapy/schedulers/frontier.py b/frontera/contrib/scrapy/schedulers/frontier.py index a677392fc..962ee8a36 100644 --- a/frontera/contrib/scrapy/schedulers/frontier.py +++ b/frontera/contrib/scrapy/schedulers/frontier.py @@ -130,7 +130,7 @@ def open(self, spider): def close(self, reason): self.logger.info("Finishing frontier (%s)", reason) self.frontier.stop() - self.stats_manager.set_iterations(self.frontier.manager.iteration) + self.stats_manager.set_iterations(getattr(self.frontier.manager, 'iteration', 0)) self.stats_manager.set_pending_requests(len(self)) def __len__(self): diff --git a/frontera/core/manager.py b/frontera/core/manager.py index 1fc7405b2..f61e66186 100644 --- a/frontera/core/manager.py +++ b/frontera/core/manager.py @@ -184,6 +184,7 @@ def __init__(self, request_model, response_model, settings=None): self._response_model = load_object(response_model) assert issubclass(self._response_model, models.Response), "Response model '%s' must subclass 'Response'" % \ self._response_model.__name__ + self.test_mode = False @classmethod def from_settings(cls, settings=None): @@ -710,12 +711,16 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No class SpiderFrontierManager(BaseContext, ComponentsPipelineMixin, BaseManager): + + auto_start = False + def __init__(self, request_model, response_model, backend, middlewares, max_next_requests, settings, canonicalsolver): BaseContext.__init__(self, request_model, response_model, settings=settings) ComponentsPipelineMixin.__init__(self, backend, middlewares=middlewares, canonicalsolver=canonicalsolver, db_worker=False, strategy_worker=False) + self.max_next_requests = max_next_requests self._components_pipeline = [ ('Middleware', self.middlewares, True), ('CanonicalSolver', self.canonicalsolver, False), @@ -733,10 +738,24 @@ def from_settings(cls, settings=None): settings=manager_settings, canonicalsolver=manager_settings.CANONICAL_SOLVER) + def get_next_requests(self, max_next_requests=0, **kwargs): + return super(SpiderFrontierManager, self).get_next_requests(max_next_requests=max_next_requests or self.max_next_requests, **kwargs) + def links_extracted(self, request, links): super(SpiderFrontierManager, self).links_extracted(request, links) super(SpiderFrontierManager, self).links_extracted_after(request, links) + @property + def finished(self): + return False + + def start(self): + self._logger.debug('START') + self._process_components(method_name='frontier_start') + + def stop(self): + super(SpiderFrontierManager, self).close() + @six.add_metaclass(ABCMeta) class UpdateScoreStream(object): diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index a877ad48c..bb2c4898b 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -25,6 +25,7 @@ HBASE_STATE_WRITE_LOG_SIZE = 15000 HBASE_QUEUE_TABLE = 'queue' KAFKA_GET_TIMEOUT = 5.0 +LOCAL_MODE = True MAX_NEXT_REQUESTS = 64 MAX_REQUESTS = 0 MESSAGE_BUS = 'frontera.contrib.messagebus.zeromq.MessageBus' diff --git a/frontera/utils/managers.py b/frontera/utils/managers.py index 61561fe2d..750c5dcb7 100644 --- a/frontera/utils/managers.py +++ b/frontera/utils/managers.py @@ -1,11 +1,12 @@ from __future__ import absolute_import -from frontera.core.manager import LocalFrontierManager +from frontera.core.manager import LocalFrontierManager, SpiderFrontierManager from .converters import 
BaseRequestConverter, BaseResponseConverter class FrontierManagerWrapper(object): def __init__(self, settings, manager=None): - manager = manager or LocalFrontierManager + if manager is None: + manager = LocalFrontierManager if settings.get("LOCAL_MODE") == True else SpiderFrontierManager self.manager = manager.from_settings(settings) self.request_converter = None self.response_converter = None diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index 8e581b4db..f0cdb5aff 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -362,6 +362,9 @@ def setup_environment(): partition_id) settings.set('SCORING_PARTITION_ID', partition_id) + if args.port: + settings.set('JSONRPC_PORT', args.port) + strategy_args = {} if args.args: for arg in args.args: From b90275b647c78833863653d22fc47e0727fa54c6 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 4 Jun 2018 12:09:04 +0200 Subject: [PATCH 190/273] auto_start/test_mode attrs refactor --- frontera/core/manager.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/frontera/core/manager.py b/frontera/core/manager.py index f61e66186..3e499b113 100644 --- a/frontera/core/manager.py +++ b/frontera/core/manager.py @@ -184,7 +184,6 @@ def __init__(self, request_model, response_model, settings=None): self._response_model = load_object(response_model) assert issubclass(self._response_model, models.Response), "Response model '%s' must subclass 'Response'" % \ self._response_model.__name__ - self.test_mode = False @classmethod def from_settings(cls, settings=None): @@ -691,6 +690,10 @@ def from_settings(cls, settings=None, db_worker=False, strategy_worker=False, sc }) return WorkerFrontierManager(**kwargs) + @property + def test_mode(self): + return False + def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): """ Creates request and applies middleware and canonical solver pipelines. 
@@ -712,8 +715,6 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No
 class SpiderFrontierManager(BaseContext, ComponentsPipelineMixin, BaseManager):
- auto_start = False
-
 def __init__(self, request_model, response_model, backend, middlewares, max_next_requests, settings,
 canonicalsolver):
 BaseContext.__init__(self, request_model, response_model, settings=settings)
@@ -738,6 +739,14 @@ def from_settings(cls, settings=None):
 settings=manager_settings,
 canonicalsolver=manager_settings.CANONICAL_SOLVER)
+ @property
+ def test_mode(self):
+ return False
+
+ @property
+ def auto_start(self):
+ return True
+
 def get_next_requests(self, max_next_requests=0, **kwargs):
 return super(SpiderFrontierManager, self).get_next_requests(max_next_requests=max_next_requests or self.max_next_requests, **kwargs)
From 7b1d3fb89dbcdfb0158ebf5e9cf6b41217d6ca20 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Mon, 18 Jun 2018 14:39:45 +0200
Subject: [PATCH 191/273] move of BaseCrawlingStrategy, BFS/DFS strategy
---
 docs/source/topics/quick-start-single.rst | 30 +++++++++++++++----
 examples/cluster/bc/broadcrawl/__init__.py | 2 +-
 .../strategies => strategy}/__init__.py | 6 +---
 .../{worker/strategies => strategy}/basic.py | 2 +-
 .../strategies/bfs.py => strategy/depth.py} | 22 +++++++++-----
 tests/backends.py | 2 +-
 tests/mocks/components.py | 2 +-
 tests/test_strategy.py | 2 +-
 8 files changed, 44 insertions(+), 24 deletions(-)
 rename frontera/{worker/strategies => strategy}/__init__.py (97%)
 rename frontera/{worker/strategies => strategy}/basic.py (92%)
 rename frontera/{worker/strategies/bfs.py => strategy/depth.py} (61%)
diff --git a/docs/source/topics/quick-start-single.rst b/docs/source/topics/quick-start-single.rst
index f1fe6b39d..6690f686b 100644
--- a/docs/source/topics/quick-start-single.rst
+++ b/docs/source/topics/quick-start-single.rst
@@ -2,8 +2,13 @@ Quick start single process
==========================
-1. Create your spider
-=====================
+The idea is that you develop and debug the crawling strategy in single process mode locally, and use the distributed
+one when deploying it for crawling in production at scale. Single process mode is also good as a first step to get
+something running quickly.
+
+
+1. Create your Scrapy spider
+============================
 Create your Scrapy project as you usually do. Enter a directory where you’d like to store your code and then run::
@@ -49,6 +54,23 @@ Configure frontier settings to use a built-in backend like in-memory BFS::
 BACKEND = 'frontera.contrib.backends.memory.BFS'
+
+5. Choose the crawling strategy
+===============================
+
+Here are the options you would need to redefine when running, in single process mode, a crawler configured for
+distributed mode:
+
+# all your distributed options, probably imported from other modules
+
+SPIDER_FEED_PARTITIONS = 1
+SPIDER_LOG_PARTITIONS = 1
+
+STRATEGY = "workers.strategy.Contacts"
+
+This config will be used by Scrapy instead of the distributed crawler config.
+
+
 5. Run the spider
 =================
 Run your Scrapy spider as usual from the command line::
 scrapy crawl myspider
And that's it! You got your spider running integrated with Frontera.
What else?
==========
Frontera provides many powerful features for making frontier management easy and
* Logging facility that you can hook on to for catching errors and debug your frontiers. 
- - - - diff --git a/examples/cluster/bc/broadcrawl/__init__.py b/examples/cluster/bc/broadcrawl/__init__.py index a2894652b..679f7d7fc 100644 --- a/examples/cluster/bc/broadcrawl/__init__.py +++ b/examples/cluster/bc/broadcrawl/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- from frontera.core.components import States -from frontera.worker.strategies import BaseCrawlingStrategy +from frontera.strategy import BaseCrawlingStrategy from frontera.contrib.backends.hbase import HBaseBackend from cachetools import LRUCache from msgpack import packb, unpackb diff --git a/frontera/worker/strategies/__init__.py b/frontera/strategy/__init__.py similarity index 97% rename from frontera/worker/strategies/__init__.py rename to frontera/strategy/__init__.py index 894266f11..1fc3b431a 100644 --- a/frontera/worker/strategies/__init__.py +++ b/frontera/strategy/__init__.py @@ -1,9 +1,5 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from frontera.core.models import Request -from frontera.contrib.middlewares.fingerprint import UrlFingerprintMiddleware - from abc import ABCMeta, abstractmethod + import six diff --git a/frontera/worker/strategies/basic.py b/frontera/strategy/basic.py similarity index 92% rename from frontera/worker/strategies/basic.py rename to frontera/strategy/basic.py index 84819370a..d6a70731b 100644 --- a/frontera/worker/strategies/basic.py +++ b/frontera/strategy/basic.py @@ -1,5 +1,5 @@ -from frontera.worker.strategies import BaseCrawlingStrategy from frontera.core.components import States +from frontera.strategy import BaseCrawlingStrategy class BasicCrawlingStrategy(BaseCrawlingStrategy): diff --git a/frontera/worker/strategies/bfs.py b/frontera/strategy/depth.py similarity index 61% rename from frontera/worker/strategies/bfs.py rename to frontera/strategy/depth.py index eb9a8b8cd..fc1c99d95 100644 --- a/frontera/worker/strategies/bfs.py +++ b/frontera/strategy/depth.py @@ -1,11 +1,10 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -from six.moves.urllib.parse import urlparse from frontera.core.components import States -from frontera.worker.strategies import BaseCrawlingStrategy +from frontera.strategy import BaseCrawlingStrategy -class CrawlingStrategy(BaseCrawlingStrategy): +class BreadthFirstCrawlingStrategy(BaseCrawlingStrategy): def read_seeds(self, fh): for url in fh: url = url.strip() @@ -13,6 +12,7 @@ def read_seeds(self, fh): self.refresh_states(req) if req.meta[b'state'] is States.NOT_CRAWLED: req.meta[b'state'] = States.QUEUED + req.meta[b'depth'] = 0 self.schedule(req) def page_crawled(self, response): @@ -23,15 +23,21 @@ def filter_extracted_links(self, request, links): def links_extracted(self, request, links): for link in links: + link.meta[b'depth'] = request.meta[b'depth'] + 1 if link.meta[b'state'] is States.NOT_CRAWLED: link.meta[b'state'] = States.QUEUED - self.schedule(link, self.get_score(link.url)) + self.schedule(link, self.get_score(link)) def page_error(self, request, error): request.meta[b'state'] = States.ERROR self.schedule(request, score=0.0, dont_queue=True) - def get_score(self, url): - url_parts = urlparse(url) - path_parts = url_parts.path.split('/') - return 1.0 / (max(len(path_parts), 1.0) + len(url_parts.path) * 0.1) + def get_score(self, link): + depth = float(link.meta[b'depth']) + return 1.0 - (depth / (depth + 1.0)) + + +class DepthFirstCrawlingStrategy(BreadthFirstCrawlingStrategy): + def get_score(self, link): + depth = float(link.meta[b'depth']) + return depth / (depth + 1.0) \ No newline at end of 
file diff --git a/tests/backends.py b/tests/backends.py index 386f15b67..595012d71 100644 --- a/tests/backends.py +++ b/tests/backends.py @@ -3,7 +3,7 @@ from frontera.core.components import States from frontera.core.manager import LocalFrontierManager -from frontera.worker.strategies import BaseCrawlingStrategy +from frontera.strategy import BaseCrawlingStrategy from frontera import Settings, FrontierTester from frontera.utils import graphs from frontera.utils.tester import BaseDownloaderSimulator diff --git a/tests/mocks/components.py b/tests/mocks/components.py index d073f0e71..61255b38b 100644 --- a/tests/mocks/components.py +++ b/tests/mocks/components.py @@ -4,7 +4,7 @@ from frontera.contrib.backends.memory import MemoryStates from six.moves import range from frontera.core.models import Request -from frontera.worker.strategies import BaseCrawlingStrategy +from frontera.strategy import BaseCrawlingStrategy from frontera.core.components import States diff --git a/tests/test_strategy.py b/tests/test_strategy.py index aa4f6201b..2771eea1f 100644 --- a/tests/test_strategy.py +++ b/tests/test_strategy.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from frontera.worker.strategies import BaseCrawlingStrategy +from frontera.strategy import BaseCrawlingStrategy from frontera.settings import Settings from frontera.core.manager import WorkerFrontierManager, StatesContext From 42722d1b8a8fe5be8d0d87a6c6d10afe9511217b Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 18 Jun 2018 15:14:54 +0200 Subject: [PATCH 192/273] seeds have to be injected using crawling strategy --- .../scrapy/middlewares/seeds/__init__.py | 24 -------------- .../contrib/scrapy/middlewares/seeds/file.py | 32 ------------------- .../contrib/scrapy/middlewares/seeds/s3.py | 30 ----------------- 3 files changed, 86 deletions(-) delete mode 100644 frontera/contrib/scrapy/middlewares/seeds/__init__.py delete mode 100644 frontera/contrib/scrapy/middlewares/seeds/file.py delete mode 100644 frontera/contrib/scrapy/middlewares/seeds/s3.py diff --git a/frontera/contrib/scrapy/middlewares/seeds/__init__.py b/frontera/contrib/scrapy/middlewares/seeds/__init__.py deleted file mode 100644 index 09cd0b7cd..000000000 --- a/frontera/contrib/scrapy/middlewares/seeds/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -from __future__ import absolute_import - - -class SeedLoader(object): - def __init__(self, crawler): - self.crawler = crawler - self.configure(crawler.settings) - - def configure(self, settings): - raise NotImplementedError - - @classmethod - def from_crawler(cls, crawler): - return cls(crawler) - - def process_start_requests(self, start_requests, spider): - urls = [url for url in self.load_seeds() if not url.startswith('#')] - return [spider.make_requests_from_url(url) for url in urls] - - def load_seeds(self): - raise NotImplementedError - - - diff --git a/frontera/contrib/scrapy/middlewares/seeds/file.py b/frontera/contrib/scrapy/middlewares/seeds/file.py deleted file mode 100644 index c70953de0..000000000 --- a/frontera/contrib/scrapy/middlewares/seeds/file.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import absolute_import -import codecs - -from scrapy.exceptions import NotConfigured - -from frontera.contrib.scrapy.middlewares.seeds import SeedLoader - - -class FileSeedLoader(SeedLoader): - def configure(self, settings): - self.seeds_source = settings.get('SEEDS_SOURCE') - if not self.seeds_source: - raise NotConfigured - - def load_seeds(self): - # TODO check if it's an existing file or a folder - return 
self.load_seeds_from_file(self.seeds_source)
-
- def load_seeds_from_file(self, file_path):
- with codecs.open(file_path, 'rU') as f:
- return self.load_seeds_from_data((f))
-
- def load_seeds_from_data(self, data):
- seeds = []
- for seed in data:
- clean_seed = self.clean_seed(seed)
- if clean_seed:
- seeds.append(clean_seed)
- return seeds
-
- def clean_seed(self, url):
- return url.strip('\t\n\r')
diff --git a/frontera/contrib/scrapy/middlewares/seeds/s3.py b/frontera/contrib/scrapy/middlewares/seeds/s3.py
deleted file mode 100644
index abaf3c3b6..000000000
--- a/frontera/contrib/scrapy/middlewares/seeds/s3.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from __future__ import absolute_import
-from six.moves.urllib.parse import urlparse
-from boto import connect_s3
-from scrapy.exceptions import NotConfigured
-
-from frontera.contrib.scrapy.middlewares.seeds.file import FileSeedLoader
-
-
-class S3SeedLoader(FileSeedLoader):
- def configure(self, settings):
- source = settings.get('SEEDS_SOURCE')
- u = urlparse(source)
- if not u.hostname or not u.scheme == 's3':
- raise NotConfigured
- self.bucket_name = u.hostname
- self.bucket_keys_prefix = u.path.lstrip('/')
- self.s3_aws_access_key = settings.get('SEEDS_AWS_ACCESS_KEY')
- self.s3_aws_secret_key = settings.get('SEEDS_AWS_SECRET_ACCESS_KEY')
-
- def load_seeds(self):
- conn = connect_s3(self.s3_aws_access_key,
- self.s3_aws_secret_key)
- bucket = conn.get_bucket(self.bucket_name)
- seeds = []
- for key in bucket.list(self.bucket_keys_prefix):
- if key.name.endswith(".txt"):
- data = key.get_contents_as_string(encoding='utf-8').split()
- file_seeds = self.load_seeds_from_data(data)
- seeds.extend(file_seeds)
- return seeds
From cd8d95ee49f766f44315f8e9e92df1af2738b0b8 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Mon, 18 Jun 2018 16:36:44 +0200
Subject: [PATCH 193/273] new CS guide, and single mode quick start
---
 docs/source/index.rst | 6 +-
 .../topics/custom_crawling_strategy.rst | 200 ++++++++++++++++++
 docs/source/topics/own_crawling_strategy.rst | 26 ---
 docs/source/topics/quick-start-single.rst | 37 ++--
 docs/source/topics/scrapy-integration.rst | 76 +------
 5 files changed, 226 insertions(+), 119 deletions(-)
 create mode 100644 docs/source/topics/custom_crawling_strategy.rst
 delete mode 100644 docs/source/topics/own_crawling_strategy.rst
diff --git a/docs/source/index.rst b/docs/source/index.rst
index d2fb91520..a99d74b68 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -55,7 +55,7 @@ Using Frontera
 topics/frontier-canonicalsolvers
 topics/frontier-backends
 topics/message_bus
- topics/own_crawling_strategy
+ topics/custom_crawling_strategy
 topics/scrapy-integration
 topics/frontera-settings
@@ -77,8 +77,8 @@ Using Frontera
 :doc:`topics/message_bus`
 Built-in message bus reference.
-:doc:`topics/own_crawling_strategy`
- Implementing own crawling strategy for distributed backend.
+:doc:`topics/custom_crawling_strategy`
+ Implementing your own crawling strategy.
 :doc:`topics/scrapy-integration`
 Learn how to use Frontera with Scrapy.
diff --git a/docs/source/topics/custom_crawling_strategy.rst b/docs/source/topics/custom_crawling_strategy.rst
new file mode 100644
index 000000000..d09076fab
--- /dev/null
+++ b/docs/source/topics/custom_crawling_strategy.rst
@@ -0,0 +1,200 @@
+=================
+Crawling strategy
+=================
+
+The crawling strategy is an essential part of a Frontera-based crawler: it guides the crawler by instructing it which pages to crawl, when, and with what priority. 
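+
+For orientation, here is a minimal sketch of such a strategy. It is modelled on the breadth-first strategy shipped
+with Frontera; the fixed score of 0.5 is purely illustrative::
+
+    from frontera.core.components import States
+    from frontera.strategy import BaseCrawlingStrategy
+
+
+    class MyCrawlingStrategy(BaseCrawlingStrategy):
+        def read_seeds(self, fh):
+            # schedule every seed URL that hasn't been seen before
+            for url in fh:
+                req = self.create_request(url.strip())
+                self.refresh_states(req)
+                if req.meta[b'state'] is States.NOT_CRAWLED:
+                    req.meta[b'state'] = States.QUEUED
+                    self.schedule(req)
+
+        def page_crawled(self, response):
+            response.meta[b'state'] = States.CRAWLED
+
+        def filter_extracted_links(self, request, links):
+            return links  # keep everything
+
+        def links_extracted(self, request, links):
+            for link in links:
+                if link.meta[b'state'] is States.NOT_CRAWLED:
+                    link.meta[b'state'] = States.QUEUED
+                    self.schedule(link, 0.5)  # illustrative fixed score
+
+        def page_error(self, request, error):
+            request.meta[b'state'] = States.ERROR
+            self.schedule(request, score=0.0, dont_queue=True)
+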
+
+
+Crawler workflow
+================
+
+A Frontera-based crawler consists of multiple processes, which run indefinitely. The state of these processes is
+persisted to permanent storage. When the processes are stopped, the state is flushed, and it will be loaded the next
+time access to a certain data item is needed. Therefore it's easy to pause the crawl by stopping the processes, doing
+the maintenance or modifying the code, and starting again without restarting the crawl from the beginning.
+
+    IMPORTANT DETAIL
+    Spider log (see http://frontera.readthedocs.io/en/latest/topics/glossary.html) uses hostname-based partitioning.
+    The content generated from a particular host will always land in the same partition (and therefore strategy worker
+    instance). That guarantees the crawling strategy you design will always be dealing with the same subset of
+    hostnames on every SW instance. It also means the same domain cannot be operated on from multiple strategy worker
+    instances. To get the hostname, the 2-nd level domain name is used, with the public suffix resolved.
+
+
+To restart the crawl, the
+
+* queue contents,
+* link states,
+* domain metadata
+
+need to be cleaned up. This is usually done by means of truncation of tables.
+
+
+Crawling strategy class
+=======================
+
+It has to inherit from BaseCrawlingStrategy and implement its API.
+
+.. autoclass:: frontera.strategy.BaseCrawlingStrategy
+
+    **Methods**
+
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.from_worker
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.read_seeds
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.page_crawled
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.filter_extracted_links
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.links_extracted
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.page_error
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.finished
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.close
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.schedule
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.create_request
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.refresh_states
+
+
+The class can be put in any module and passed to :term:`strategy worker` or a local Scrapy process using a command
+line option or the :setting:`CRAWLING_STRATEGY` setting on startup.
+
+The strategy class can use its own storage or any other kind of resources. All items from the :term:`spider log` will
+be passed through these methods. Scores returned don't have to be the same as in the method arguments.
+Periodically, the ``finished()`` method is called to check if the crawling goal is achieved.
+
+Workflow
+--------
+
+There are essentially two workflows: seeds addition (or injection) and the main workflow. When the crawl starts from
+scratch, it has to run the seed injection first and then proceed with the main workflow. A paused/resumed crawler
+runs the main workflow.
+
+Seeds addition
+^^^^^^^^^^^^^^
+
+The purpose of this step is to inject the seeds into the crawler pipeline. The framework allows processing the seeds
+stream (which is read from a file placed locally or in S3), creating the requests needed, getting their link states,
+and scheduling them. Once requests are scheduled, they will get to the queue and propagate to the spiders.
+
+To enter this workflow, the user runs the strategy worker in add-seeds mode, providing arguments to the crawling
+strategy from the command line. In particular, --seeds-url is used with an s3 or local file URL containing the seeds
+to inject.
+
+1. 
from_worker() → init()
+2. read_seeds(stream from file, None if file isn't present)
+3. exit
+
+It's very convenient to run the seeds addition using the helper app in Frontera::
+
+    $ python -m frontera.utils.add_seeds --config ... --seeds-file ...
+
+
+Main
+^^^^
+
+This is the main cycle used when the crawl is in progress. In a nutshell, on every spider event the specific handler
+is called, depending on the type of event. When the strategy worker gets the SIGTERM signal, it tries to stop politely
+by calling close(). In its normal state it listens to the spider log and executes the event handlers.
+
+1. from_worker() → init()
+2. page_crawled(response) OR page_error(request, error) OR filter_extracted_links(request, links) and subsequent links_extracted(request, links)
+3. close()
+4. exit
+
+Scheduling and creating requests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ultimate goal of the crawling strategy is the scheduling of requests. To schedule a request, there is the method
+schedule(request, score). The request is an instance of the :class:`Request ` class and is
+often available from the arguments of the event handlers *page_crawled*, *page_error* and *links_extracted*, or can
+be created on demand using the *create_request()* method.
+
+    IMPORTANT NOTICE
+
+    The request created with create_request() lacks the state (meta[b'state']). To get the states, the strategy worker
+    needs to access the backend, and this is not happening when you call create_request(). Instead, it is expected
+    that you will create a batch of requests and call refresh_states(iterable) on the whole batch. After
+    refresh_states is done, you will have the states available for your newly created requests.
+
+    The Request objects created by the strategy worker for the event handlers always have the states assigned.
+
+State operations
+^^^^^^^^^^^^^^^^
+
+Every link has a state. The purpose of these states is to allow the developer to persist the state of the link in the
+system (allowing restart of SW components without data loss) and use it for decision making. The states are cached in
+the strategy worker, flushed to the backend, and loaded when needed. States can have the following values:
+
+* NOT_CRAWLED,
+* QUEUED,
+* CRAWLED,
+* ERROR
+
+NOT_CRAWLED is assigned when a link is new and wasn't seen previously; the rest of the state values must be assigned
+in the crawling strategy code.
+
+States allow implementing logic such as:
+
+* Basic visit-once of every link found,
+* Revisiting by time condition, if the state is coupled with a timestamp (requires minor modification of the backend),
+* Re-visiting of errored links depending on the type of error (fatal errors are skipped, and recoverable ones are revisited),
+* Analysis of the states database to collect the state stats using Hadoop jobs.
+
+See also
+
+https://github.com/scrapinghub/frontera/blob/master/frontera/core/components.py#L105
+
+
+Components
+==========
+
+There are certain building blocks, and successful solutions exist for the common problems.
+
+DomainCache
+-----------
+
+It's often needed to persist per-host metadata in permanent storage. To solve this, there is a DomainCache available
+at class path frontera.contrib.backends.hbase.domaincache. It has the interface of Python mapping types
+(https://docs.python.org/3/library/stdtypes.html?highlight=mapping#mapping-types-dict), is backed by two generations
+of in-memory cache with LRU logic, and is persisted in HBase only (currently). It's expected that one will be using
+domain names as keys and dicts as values. 
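+
+For illustration, here is a sketch of using it from a page_crawled() handler. The way the instance is obtained (an
+assumed self.domain_metadata reference to the backend's DomainCache) and the counter name are illustrative; only the
+plain mapping operations are part of the interface::
+
+    def page_crawled(self, response):
+        response.meta[b'state'] = States.CRAWLED
+        # b'name' inside meta[b'domain'] is filled by Frontera's DomainMiddleware
+        domain = response.meta[b'domain'][b'name']
+        # domain name -> dict of per-domain values
+        stats = self.domain_metadata[domain] if domain in self.domain_metadata else {"crawled": 0}
+        stats["crawled"] += 1
+        self.domain_metadata[domain] = stats  # write back so the update is persisted
+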
It's convenient for storing per-domain statistics there, such as ban states, the count
+of links found, etc.
+
+
+PublicSuffix
+------------
+When crawling multiple domains (especially unknown ones) it's important to resolve the 2-nd level domain name properly
+using publicsuffix.
+
+It is a library from the publicsuffix module provided by https://publicsuffix.org/. The purpose is to maintain a
+public suffix list of ccTLDs and name resolution routines for them in a single library. For us it's convenient to use
+this library everywhere domain name resolution is needed. Here are a few examples:
+
+* www.london.co.uk → london.co.uk
+* images.yandex.ru → yandex.ru
+* t.co → t.co
+
+    As you may see, the number of dots in the reversed domain name cannot be used for domain name resolution.
+
+Useful details
+==============
+
+Debugging crawling strategy
+---------------------------
+The best approach I found is to log all the events and outcomes using Python native logging, i.e. to set up a logger
+for the crawling strategy class and use it. When debug output is needed, you will be able to set the logger to output
+to a file, with a specific format and log level. After you have the logging output set up, you should start the crawl
+of the problematic website locally, then collect and analyse the log output.
+
+Other approaches include analysis of the links database, inspection of the domain metadata and states tables, and
+collecting the log output of link state changes (an experimental SW feature).
+
+Meta fields
+-----------
+
+=== ================ ==================================================================================================== ========
+#   name             description                                                                                          presence
+=== ================ ==================================================================================================== ========
+1   b"slot"          Queue partitioning key in bytes, highest priority. Use it if your app requires partitioning other    Optional
+                     than the default 2-nd level domain-based partitioning.
+2   b"domain"        Dict generated by Frontera DomainMiddleware, containing the parsed domain name.                      Always
+3   b"state"         Integer representing the link state, set by the strategy worker. Link states are defined in          Always
+                     frontera.core.components.States.
+4   b"encoding"      In response, for HTML, the encoding detected by Scrapy.                                              Optional
+5   b"scrapy_meta"   When scheduling, can be used to set the meta field for Scrapy.                                       Optional
+=== ================ ==================================================================================================== ========
+
+Keys and string types in nested structures are always bytes.
diff --git a/docs/source/topics/own_crawling_strategy.rst b/docs/source/topics/own_crawling_strategy.rst
deleted file mode 100644
index 6067105eb..000000000
--- a/docs/source/topics/own_crawling_strategy.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-=================
-Crawling strategy
-=================
-
-Use ``cluster`` example and ``frontera.worker.strategies.bfs`` module for reference. In general, you need to write a
-crawling strategy class by subclassing:
-
-.. autoclass:: frontera.worker.strategies.BaseCrawlingStrategy
-
-    **Methods**
-
-    .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.from_worker
-    .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.add_seeds
-    .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.page_crawled
-    .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.page_error
-    .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.finished
-    .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.close
-
-
-The class can be put in any module and passed to :term:`strategy worker` using command line option or
-:setting:`CRAWLING_STRATEGY` setting on startup.
-
-The strategy class instantiated in strategy worker, and can use it's own storage or any other kind of resources. 
All
-items from :term:`spider log` will be passed through these methods. Scores returned doesn't have to be the same as in
-method arguments. Periodically ``finished()`` method is called to check if crawling goal is achieved.
-
diff --git a/docs/source/topics/quick-start-single.rst b/docs/source/topics/quick-start-single.rst
index 6690f686b..6bbe8d18d 100644
--- a/docs/source/topics/quick-start-single.rst
+++ b/docs/source/topics/quick-start-single.rst
@@ -6,7 +6,6 @@ The idea is that you develop and debug the crawling strategy in single process
 one when deploying it for crawling in production at scale. Single process mode is also good as a first step to get
 something running quickly.
-
 1. Create your Scrapy spider
 ============================
@@ -46,39 +45,45 @@ See :doc:`installation`.
 This article about :doc:`integration with Scrapy ` explains this step in detail.
+4. Choose your crawling strategy
+================================
-4. Choose your backend
-======================
+Here are the options you would need to redefine when running, in single process mode, a crawler configured for
+distributed mode::
-Configure frontier settings to use a built-in backend like in-memory BFS::
+    # these two parameters tell Frontera that it will run locally
- BACKEND = 'frontera.contrib.backends.memory.BFS'
+    SPIDER_FEED_PARTITIONS = 1
+    SPIDER_LOG_PARTITIONS = 1
+    STRATEGY = "frontera.strategy.basic.BasicCrawlingStrategy"
-5. Choose the crawling strategy
-===============================
+5. Choose your backend
+======================
-Here are the options you would need to redefine when running, in single process mode, a crawler configured for
-distributed mode:
+Configure frontier settings to use a built-in backend like::
+    BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
-# all your distributed options, probably imported from other modules
-SPIDER_FEED_PARTITIONS = 1
-SPIDER_LOG_PARTITIONS = 1
-STRATEGY = "workers.strategy.Contacts"
-This config will be used by Scrapy instead of the distributed crawler config.
+6. Inject the seed URLs
+=======================
+This step is required only if your crawling strategy requires seeds to be injected from an external source::
+    $ python -m frontera.worker.utils.add_seeds --config [your_frontera_config] --seeds-file [path to your seeds file]
+After the script finishes successfully, your seeds should be stored in the backend's queue and scheduled for crawling.
-5. Run the spider
+7. Run the spider
=================
Run your Scrapy spider as usual from the command line::
 scrapy crawl myspider
-And that's it! You got your spider running integrated with Frontera.
+And that's it! You got your crawler running integrated with Frontera.
What else?
==========
diff --git a/docs/source/topics/scrapy-integration.rst b/docs/source/topics/scrapy-integration.rst
index 56422ea70..ff3a8bee6 100644
--- a/docs/source/topics/scrapy-integration.rst
+++ b/docs/source/topics/scrapy-integration.rst
@@ -2,8 +2,8 @@ Using the Frontier with Scrapy
==============================
-Using Frontera is quite easy, it includes a set of `Scrapy middlewares`_ and Scrapy scheduler that encapsulates
-Frontera usage and can be easily configured using `Scrapy settings`_.
+To use Frontera with Scrapy, you will need to add the `Scrapy middlewares`_ and replace the default Scrapy scheduler
+with the custom Frontera scheduler. Both can be done by modifying `Scrapy settings`_. 
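+
+In practice this boils down to a few lines in the Scrapy settings module; a sketch (the ordering values are
+conventional, and the FRONTERA_SETTINGS path follows the directory layout proposed below)::
+
+    SPIDER_MIDDLEWARES.update({
+        'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
+    })
+    DOWNLOADER_MIDDLEWARES.update({
+        'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000,
+    })
+    SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'
+    FRONTERA_SETTINGS = 'my_scrapy_project.frontera.settings'
+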
Activating the frontier
@@ -32,7 +32,6 @@ Create a Frontera ``settings.py`` file and add it to your Scrapy settings::
 Another option is to put these settings right into Scrapy settings module.
-
Organizing files
================
@@ -43,8 +42,6 @@ When using frontier with a Scrapy project, we propose the following directory st
 frontera/
 __init__.py
 settings.py
- middlewares.py
- backends.py
 spiders/
 ...
 __init__.py
These are basically:
- ``my_scrapy_project/frontera/settings.py``: the Frontera settings file.
-- ``my_scrapy_project/frontera/middlewares.py``: the middlewares used by the Frontera.
-- ``my_scrapy_project/frontera/backends.py``: the backend(s) used by the Frontera.
- ``my_scrapy_project/spiders``: the Scrapy spiders folder
- ``my_scrapy_project/settings.py``: the Scrapy settings file
- ``scrapy.cfg``: the Scrapy config file
@@ -124,12 +119,6 @@ Configuration guidelines
There are several tunings you can make for efficient broad crawling.
-Adding one of seed loaders for bootstrapping of crawling process::
-
- SPIDER_MIDDLEWARES.update({
- 'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1,
- })
-
Various settings suitable for broad crawling::
 HTTPCACHE_ENABLED = False # Turns off disk cache, which has low hit ratio during broad crawls
@@ -160,65 +149,4 @@ Check also `Scrapy broad crawling`_ recommendations.
 .. _`Quick start single process`: http://frontera.readthedocs.org/en/latest/topics/quick-start-single.html
 .. _`Scrapy broad crawling`: http://doc.scrapy.org/en/master/topics/broad-crawls.html
-
-
-Scrapy Seed Loaders
-===================
-
-Frontera has some built-in Scrapy middlewares for seed loading.
-
-Seed loaders use the ``process_start_requests`` method to generate requests from a source that are added later to the
-:class:`FrontierManager `.
-
-
-Activating a Seed loader
-------------------------
-
-Just add the Seed Loader middleware to the ``SPIDER_MIDDLEWARES`` scrapy settings::
-
- SPIDER_MIDDLEWARES.update({
- 'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 650
- })
-
-
-.. _seed_loader_file:
-
-FileSeedLoader
---------------
-
-Load seed URLs from a file. The file must be formatted contain one URL per line::
-
- http://www.asite.com
- http://www.anothersite.com
- ...
-
-Yo can disable URLs using the ``#`` character::
-
- ...
- #http://www.acommentedsite.com
- ...
-
-**Settings**:
-
-- ``SEEDS_SOURCE``: Path to the seeds file
-
-
-.. _seed_loader_s3:
-
-S3SeedLoader
-------------
-
-Load seeds from a file stored in an Amazon S3 bucket
-
-File format should the same one used in :ref:`FileSeedLoader `.
-
-Settings:
-
-- ``SEEDS_SOURCE``: Path to S3 bucket file. eg: ``s3://some-project/seed-urls/``
-
-- ``SEEDS_AWS_ACCESS_KEY``: S3 credentials Access Key
-
-- ``SEEDS_AWS_SECRET_ACCESS_KEY``: S3 credentials Secret Access Key
-
-
.. 
_`Scrapy Middleware doc`: http://doc.scrapy.org/en/latest/topics/spider-middleware.html From df214c680bc946762e78c6307cae97f80573255d Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 12 Jul 2018 19:25:27 +0500 Subject: [PATCH 194/273] removed obsolete test --- tests/test_seed_loader.py | 127 -------------------------------------- 1 file changed, 127 deletions(-) delete mode 100644 tests/test_seed_loader.py diff --git a/tests/test_seed_loader.py b/tests/test_seed_loader.py deleted file mode 100644 index bc512e2a9..000000000 --- a/tests/test_seed_loader.py +++ /dev/null @@ -1,127 +0,0 @@ -import os -import unittest -from shutil import rmtree -from tempfile import mkdtemp - -from scrapy.spiders import Spider - -from frontera.settings import Settings -from frontera.contrib.scrapy.middlewares.seeds.file import FileSeedLoader, NotConfigured -from frontera.contrib.scrapy.middlewares.seeds.s3 import S3SeedLoader - -from tests.mocks.boto import MockConnection -from tests import mock - - -class TestFileSeedLoader(unittest.TestCase): - - def setUp(self): - self.tmp_path = mkdtemp() - - def tearDown(self): - rmtree(self.tmp_path) - - def seed_loader_setup(self, seeds_content=None): - seed_path = os.path.join(self.tmp_path, 'seeds.txt') - default_content = """ -https://www.example.com -https://www.scrapy.org -""" - seeds_content = seeds_content or default_content - with open(seed_path, 'wb') as tmpl_file: - tmpl_file.write(seeds_content.encode('utf-8')) - assert os.path.isfile(seed_path) # Failure of test itself - settings = Settings() - settings.SEEDS_SOURCE = seed_path - crawler = type('crawler', (object,), {}) - crawler.settings = settings - return FileSeedLoader(crawler) - - def test_seeds_not_configured(self): - crawler = type('crawler', (object,), {}) - crawler.settings = Settings() - self.assertRaises(NotConfigured, FileSeedLoader, crawler) - - def test_load_seeds(self): - seed_loader = self.seed_loader_setup() - seeds = seed_loader.load_seeds() - self.assertEqual(seeds, ['https://www.example.com', 'https://www.scrapy.org']) - - def test_process_start_requests(self): - seed_loader = self.seed_loader_setup() - requests = seed_loader.process_start_requests(None, Spider(name='spider')) - self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org']) - - def test_process_start_requests_ignore_comments(self): - seeds_content = """ -https://www.example.com -# https://www.dmoz.org -https://www.scrapy.org -# https://www.test.com -""" - seed_loader = self.seed_loader_setup(seeds_content) - requests = seed_loader.process_start_requests(None, Spider(name='spider')) - self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org']) - - -class TestS3SeedLoader(unittest.TestCase): - - def setUp(self): - self.tmp_path = mkdtemp() - settings = Settings() - settings.SEEDS_SOURCE = 's3://some-bucket/seeds-folder' - settings.SEEDS_AWS_ACCESS_KEY = 'access_key' - settings.SEEDS_AWS_SECRET_ACCESS_KEY = 'secret_key' - crawler = type('crawler', (object,), {}) - crawler.settings = settings - self.seed_path_1 = os.path.join(self.tmp_path, 'seeds1.txt') - self.seed_path_2 = os.path.join(self.tmp_path, 'seeds2.txt') - s1_content = """ -https://www.example.com -https://www.scrapy.org -""" - s2_content = """ -https://www.dmoz.org -https://www.test.com -""" - - with open(self.seed_path_1, 'wb') as tmpl_file: - tmpl_file.write(s1_content.encode('utf-8')) - with open(self.seed_path_2, 'wb') as tmpl_file: - 
tmpl_file.write(s2_content.encode('utf-8')) - self.seed_loader = S3SeedLoader(crawler) - - def tearDown(self): - rmtree(self.tmp_path) - - def test_invalid_s3_seed_source(self): - crawler = type('crawler', (object,), {}) - settings = Settings() - settings.SEEDS_SOURCE = 'invalid_url' - crawler.settings = settings - self.assertRaises(NotConfigured, S3SeedLoader, crawler) - - def test_process_start_requests(self): - urls = ['https://www.example.com', 'https://www.scrapy.org', - 'https://www.dmoz.org', 'https://www.test.com'] - self.check_request_urls(urls) - - def test_s3_loader_ignores_non_txt_files(self): - urls = [] - self.check_request_urls(urls, '.ini') - - def check_request_urls(self, urls, key_extension='.txt'): - with open(self.seed_path_1, 'rU') as s1: - with open(self.seed_path_2, 'rU') as s2: - conn = MockConnection() - bucket = conn.create_bucket('some-bucket') - bucket.add_key('seeds-folder/seeds1%s' % key_extension, s1) - bucket.add_key('seeds-folder/seeds2%s' % key_extension, s2) - - def mocked_connect_s3(*args, **kwargs): - return conn - - with mock.patch('frontera.contrib.scrapy.middlewares.seeds.s3.connect_s3', - side_effect=mocked_connect_s3): - requests = self.seed_loader.process_start_requests(None, Spider(name='spider')) - self.assertEqual(set([r.url for r in requests]), set(urls)) From ebd85329dd47c702f031b4ef99eaec86320fd287 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 12 Jul 2018 19:53:54 +0500 Subject: [PATCH 195/273] new path for a crawling strategy --- frontera/settings/default_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index bb2c4898b..1285b1ff3 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -68,7 +68,7 @@ STATE_CACHE_SIZE = 1000000 STATE_CACHE_SIZE_LIMIT = 0 STORE_CONTENT = False -STRATEGY = 'frontera.worker.strategies.basic.BasicCrawlingStrategy' +STRATEGY = 'frontera.strategy.basic.BasicCrawlingStrategy' STRATEGY_ARGS = {} SW_FLUSH_INTERVAL = 300 TEST_MODE = False From b37e7d2525a4734cc2711a6d906516cd98ff68c6 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Thu, 12 Jul 2018 20:18:35 +0500 Subject: [PATCH 196/273] style --- frontera/utils/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/utils/managers.py b/frontera/utils/managers.py index 750c5dcb7..90479e819 100644 --- a/frontera/utils/managers.py +++ b/frontera/utils/managers.py @@ -6,7 +6,7 @@ class FrontierManagerWrapper(object): def __init__(self, settings, manager=None): if manager is None: - manager = LocalFrontierManager if settings.get("LOCAL_MODE") == True else SpiderFrontierManager + manager = LocalFrontierManager if settings.get("LOCAL_MODE") is True else SpiderFrontierManager self.manager = manager.from_settings(settings) self.request_converter = None self.response_converter = None From bca36a89131ee591b9f11a6a0b32f98d0042f38b Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 13 Jul 2018 13:19:16 +0500 Subject: [PATCH 197/273] DomainMetadata interface --- frontera/core/components.py | 71 +++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/frontera/core/components.py b/frontera/core/components.py index 76abf81cf..1cda8d215 100644 --- a/frontera/core/components.py +++ b/frontera/core/components.py @@ -1,6 +1,7 @@ from __future__ import absolute_import -from abc import ABCMeta, abstractmethod, abstractproperty + 
import six
+from abc import ABCMeta, abstractmethod
class StartStopMixin(object):
@@ -90,8 +91,8 @@ def count(self):
 @six.add_metaclass(ABCMeta)
 class States(StartStopMixin):
- """Interface definition for a document states management class. This class is responsible for providing actual
- documents state, and persist the state changes in batch-oriented manner."""
+ """Interface definition for a link states management class. This class is responsible for providing the actual
+ link state, and for persisting the state changes in a batch-oriented manner."""
 NOT_CRAWLED = 0
 QUEUED = 1
@@ -133,6 +134,53 @@ def fetch(self, fingerprints):
 raise NotImplementedError
+@six.add_metaclass(ABCMeta)
+class DomainMetadata(StartStopMixin):
+    """
+    Interface definition for a domain metadata storage. Its main purpose is to store per-domain metadata using
+    Python-friendly structures. Meant to be used by the crawling strategy to store counters and flags in the low
+    level facilities provided by the Backend.
+    """
+
+    @abstractmethod
+    def __setitem__(self, key, value):
+        """
+        Puts a (key, value) pair in the storage.
+
+        :param key: str
+        :param value: Any
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def __getitem__(self, key):
+        """
+        Retrieves the value associated with the key. Raises KeyError if the key is absent.
+
+        :param key: str
+        :return value: Any
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def __delitem__(self, key):
+        """
+        Removes the pair associated with the key from the storage. Raises KeyError if the key is absent.
+
+        :param key: str
+        """
+        raise NotImplementedError
+
+    def __contains__(self, key):
+        """
+        Checks if the key is present in the storage.
+
+        :param key: str
+        :return: boolean
+        """
+        raise NotImplementedError
+
+
@six.add_metaclass(ABCMeta)
class Component(Metadata):
"""
@@ -195,27 +243,38 @@ class CanonicalSolver(Middleware):
class PropertiesMixin(object):
- @abstractproperty
+ @property
+ @abstractmethod
 def queue(self):
 """
 :return: associated :class:`Queue ` object
 """
 raise NotImplementedError
- @abstractproperty
+ @property
+ @abstractmethod
 def metadata(self):
 """
 :return: associated :class:`Metadata ` object
 """
 raise NotImplementedError
- @abstractproperty
+ @property
+ @abstractmethod
 def states(self):
 """
 :return: associated :class:`States ` object
 """
 raise NotImplementedError
+ @property
+ @abstractmethod
+ def domain_metadata(self):
+ """
+ :return: associated :class:`DomainMetadata ` object
+ """
+ raise NotImplementedError
+
@six.add_metaclass(ABCMeta)
class Backend(PropertiesMixin, Component):
From 524dd33620492bc25e4961c678c6c882d936c7a8 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Fri, 13 Jul 2018 13:19:41 +0500
Subject: [PATCH 198/273] using DomainMetadata in hbase DomainCache
---
 frontera/contrib/backends/hbase/domaincache.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py
index e8eca46d0..e88208ac5 100644
--- a/frontera/contrib/backends/hbase/domaincache.py
+++ b/frontera/contrib/backends/hbase/domaincache.py
@@ -8,6 +8,7 @@
 from msgpack import packb, unpackb
 from w3lib.util import to_bytes, to_native_str
+from frontera.core.components import DomainMetadata
 from frontera.contrib.backends.hbase.utils import HardenedBatch
 from frontera.utils.msgpack import restruct_for_pack
@@ -61,7 +62,7 @@ def _update_order(self, key):
 self.__order[key] = None
-class DomainCache(LRUCache):
+class DomainCache(LRUCache, DomainMetadata):
 """
 This is an implementation 
of Domain metadata cache backed by HBase table. It's main purpose is to store the domain metadata in Python-friendly structures while providing fast and reliable access. From 943231b15d0ff4c6f0407c54a5c69fb31c826d81 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 13 Jul 2018 13:20:30 +0500 Subject: [PATCH 199/273] SQLA DomainMetadata storage --- .../contrib/backends/sqlalchemy/__init__.py | 12 +++-- .../contrib/backends/sqlalchemy/components.py | 51 ++++++++++++++++--- .../contrib/backends/sqlalchemy/models.py | 21 ++++++++ .../sqlalchemy/test_domain_metadata.py | 42 +++++++++++++++ 4 files changed, 116 insertions(+), 10 deletions(-) create mode 100644 tests/contrib/backends/sqlalchemy/test_domain_metadata.py diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index 810975d67..d007ef1bd 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -6,7 +6,7 @@ from frontera.core.components import DistributedBackend from frontera.contrib.backends import CommonBackend -from frontera.contrib.backends.sqlalchemy.components import Metadata, Queue, States +from frontera.contrib.backends.sqlalchemy.components import Metadata, Queue, States, DomainMetadata from frontera.contrib.backends.sqlalchemy.models import DeclarativeBase from frontera.utils.misc import load_object @@ -120,6 +120,7 @@ def __init__(self, manager): self._metadata = None self._queue = None self._states = None + self._domain_metadata = None @classmethod def strategy_worker(cls, manager): @@ -141,6 +142,7 @@ def strategy_worker(cls, manager): session.close() b._states = States(b.session_cls, model, settings.get('STATE_CACHE_SIZE_LIMIT')) + b._domain_metadata = DomainMetadata(b.session_cls) return b @classmethod @@ -185,13 +187,17 @@ def metadata(self): def states(self): return self._states + @property + def domain_metadata(self): + return self._domain_metadata + def frontier_start(self): - for component in [self.metadata, self.queue, self.states]: + for component in [self.metadata, self.queue, self.states, self.domain_metadata]: if component: component.frontier_start() def frontier_stop(self): - for component in [self.metadata, self.queue, self.states]: + for component in [self.metadata, self.queue, self.states, self.domain_metadata]: if component: component.frontier_stop() diff --git a/frontera/contrib/backends/sqlalchemy/components.py b/frontera/contrib/backends/sqlalchemy/components.py index ab99a0fac..68c0e4493 100644 --- a/frontera/contrib/backends/sqlalchemy/components.py +++ b/frontera/contrib/backends/sqlalchemy/components.py @@ -1,18 +1,19 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -import logging -from datetime import datetime + from time import time, sleep +import logging +import six from cachetools import LRUCache -from frontera.contrib.backends.partitioners import Crc32NamePartitioner +from datetime import datetime from frontera.contrib.backends.memory import MemoryStates -from frontera.contrib.backends.sqlalchemy.models import DeclarativeBase -from frontera.core.components import Metadata as BaseMetadata, Queue as BaseQueue +from frontera.contrib.backends.partitioners import Crc32NamePartitioner +from frontera.contrib.backends.sqlalchemy.models import DeclarativeBase, DomainMetadataModel as DomainMetadataKV +from frontera.core.components import Metadata as BaseMetadata, Queue as BaseQueue, DomainMetadata as BaseDomainMetadata from frontera.core.models import 
Request, Response
 from frontera.utils.misc import get_crc32, chunks
 from frontera.utils.url import parse_domain_from_url_fast
-import six
 from six.moves import range
 from w3lib.util import to_native_str, to_bytes
@@ -38,7 +39,7 @@ def func_wrapper(self, *args, **kwargs):
 class Metadata(BaseMetadata):
 def __init__(self, session_cls, model_cls, cache_size):
- self.session = session_cls(expire_on_commit=False) # FIXME: Should be explicitly mentioned in docs
+ self.session = session_cls(expire_on_commit=False)
 self.model = model_cls
 self.table = DeclarativeBase.metadata.tables['metadata']
 self.cache = LRUCache(cache_size)
@@ -275,3 +276,39 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs):
 self.session.delete(item)
 self.session.commit()
 return results
+
+
+class DomainMetadata(BaseDomainMetadata):
+    def __init__(self, session_cls):
+        self.session = session_cls(expire_on_commit=False)
+        self.table = DeclarativeBase.metadata.tables['domain_metadata']
+        self.logger = logging.getLogger("sqlalchemy.domain_metadata")
+
+    def frontier_stop(self):
+        self.session.close()
+
+    @retry_and_rollback
+    def __setitem__(self, key, value):
+        pair = DomainMetadataKV(key=key, value=value)
+        self.session.merge(pair)
+        self.session.commit()
+
+    @retry_and_rollback
+    def __getitem__(self, key):
+        result = self.session.query(DomainMetadataKV).filter(DomainMetadataKV.key == key).first()
+        if result is None:
+            raise KeyError
+        return result.value
+
+    @retry_and_rollback
+    def __contains__(self, key):
+        result = self.session.query(DomainMetadataKV.key).filter(DomainMetadataKV.key == key).first()
+        if result is not None:
+            return True
+        return False
+
+    @retry_and_rollback
+    def __delitem__(self, key):
+        self.session.query(DomainMetadataKV).filter(DomainMetadataKV.key == key).delete(synchronize_session=False)
+        self.session.commit()
+
diff --git a/frontera/contrib/backends/sqlalchemy/models.py b/frontera/contrib/backends/sqlalchemy/models.py
index 8211d21c6..8be5fb0af 100644
--- a/frontera/contrib/backends/sqlalchemy/models.py
+++ b/frontera/contrib/backends/sqlalchemy/models.py
@@ -90,3 +90,24 @@ def query(cls, session):
 def __repr__(self):
 return '<Queue:%s (%s)>' % (self.url, self.id)
+
+
+class DomainMetadataModel(DeclarativeBase):
+    __tablename__ = 'domain_metadata'
+    __table_args__ = (
+        {
+            'mysql_charset': 'utf8',
+            'mysql_engine': 'InnoDB',
+            'mysql_row_format': 'DYNAMIC',
+        },
+    )
+
+    key = Column(String(256), primary_key=True, nullable=False)
+    value = Column(PickleType())
+
+    @classmethod
+    def query(cls, session):
+        return session.query(cls)
+
+    def __repr__(self):
+        return '<DomainMetadata:%s>' % (self.key)
\ No newline at end of file
diff --git a/tests/contrib/backends/sqlalchemy/test_domain_metadata.py b/tests/contrib/backends/sqlalchemy/test_domain_metadata.py
new file mode 100644
index 000000000..6f77f70a8
--- /dev/null
+++ b/tests/contrib/backends/sqlalchemy/test_domain_metadata.py
@@ -0,0 +1,42 @@
+from frontera.contrib.backends.sqlalchemy.components import DomainMetadata, DomainMetadataKV
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+from unittest import TestCase
+import random
+import string
+
+
+def random_string(N):
+    return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N))
+
+
+class TestSqlAlchemyDomainMetadata(TestCase):
+    def setUp(self):
+        self.engine = create_engine("sqlite:///:memory:")
+        self.session_cls = sessionmaker()
+        self.session_cls.configure(bind=self.engine)
+        DomainMetadataKV.__table__.create(bind=self.engine)
+
+    def 
test_basic(self):
+        dm = DomainMetadata(self.session_cls)
+        value = {"someint": 1, "somefloat": 1, "someblob": b"bytes"}
+        dm["test"] = value
+        assert "test" in dm
+        assert dm["test"] == value
+        del dm["test"]
+        assert "test" not in dm
+
+        dm["test"] = 111
+        assert "test" in dm
+        assert dm["test"] == 111
+
+    def test_many_items(self):
+        dm = DomainMetadata(self.session_cls)
+        for i in range(200):
+            dm["key%d" % i] = random_string(10)
+
+        for i in range(200):
+            assert "key%d" % i in dm
+            assert len(dm["key%d" % i]) == 10
+            del dm["key%d" % i]
+
From 7c1272a9de3f4dab78d7461b7ce06da94c83e852 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Fri, 13 Jul 2018 13:20:46 +0500
Subject: [PATCH 200/273] moved domain cache test
---
 tests/{ => contrib/backends/hbase}/test_domain_cache.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{ => contrib/backends/hbase}/test_domain_cache.py (100%)
diff --git a/tests/test_domain_cache.py b/tests/contrib/backends/hbase/test_domain_cache.py
similarity index 100%
rename from tests/test_domain_cache.py
rename to tests/contrib/backends/hbase/test_domain_cache.py
From f7044ac4f8caaf9db2ad37eb44e5b33c6e758369 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Mon, 16 Jul 2018 12:29:34 +0500
Subject: [PATCH 201/273] DomainCache is instantiated in HBaseBackend now
---
 docs/source/topics/frontera-settings.rst | 31 +++++++++++++++++++
 docs/source/topics/frontier-backends.rst | 3 +-
 frontera/contrib/backends/hbase/__init__.py | 16 +++++++++--
 frontera/settings/default_settings.py | 3 ++
 4 files changed, 50 insertions(+), 3 deletions(-)
diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst
index 0bd8c3a03..db7ab90bb 100644
--- a/docs/source/topics/frontera-settings.rst
+++ b/docs/source/topics/frontera-settings.rst
@@ -591,6 +591,37 @@ Default: ``False``

Enables dropping and creation of new HBase tables on worker start.

+.. setting:: HBASE_DOMAIN_METADATA_TABLE
+
+HBASE_DOMAIN_METADATA_TABLE
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: ``domain_metadata``
+
+Name of the domain metadata table in HBase.
+
+
+.. setting:: HBASE_DOMAIN_METADATA_CACHE_SIZE
+
+HBASE_DOMAIN_METADATA_CACHE_SIZE
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: 1000
+
+The count of domain-value pairs cached in memory in the :term:`strategy worker`. Pairs are evicted from the cache
+using an LRU policy.
+
+
+.. setting:: HBASE_DOMAIN_METADATA_BATCH_SIZE
+
+HBASE_DOMAIN_METADATA_BATCH_SIZE
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: 100
+
+Maximum count of domain-value pairs kept in the write buffer before the actual write happens.
+
+
.. setting:: HBASE_METADATA_TABLE

HBASE_METADATA_TABLE
diff --git a/docs/source/topics/frontier-backends.rst b/docs/source/topics/frontier-backends.rst
index ad5843378..dec8a6694 100644
--- a/docs/source/topics/frontier-backends.rst
+++ b/docs/source/topics/frontier-backends.rst
@@ -278,7 +278,8 @@ HBase backend

Is more suitable for large scale web crawlers. Settings reference can be found here :ref:`hbase-settings`. Consider
tuning the block cache to fit states within one block for an average size website. To achieve this it's recommended to use
-:attr:`hostname_local_fingerprint ` to achieve documents closeness within the same host. This function can be selected with :setting:`URL_FINGERPRINT_FUNCTION` setting.
+:attr:`hostname_local_fingerprint ` to achieve document
+closeness within the same host. This function can be selected with :setting:`URL_FINGERPRINT_FUNCTION` setting.

.. 
TODO: document details of block cache tuning, BC* settings and queue get operation concept, diff --git a/frontera/contrib/backends/hbase/__init__.py b/frontera/contrib/backends/hbase/__init__.py index ac10eb12b..0c2a08e05 100644 --- a/frontera/contrib/backends/hbase/__init__.py +++ b/frontera/contrib/backends/hbase/__init__.py @@ -6,6 +6,7 @@ from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.utils.misc import chunks, get_crc32, time_elapsed from frontera.contrib.backends.remote.codecs.msgpack import Decoder, Encoder +from frontera.contrib.backends.hbase.domaincache import DomainCache from happybase import Connection from msgpack import Unpacker, Packer, packb @@ -495,6 +496,7 @@ def __init__(self, manager): self._metadata = None self._queue = None self._states = None + self._domain_metadata = None def _init_states(self, settings): self._states = HBaseState(connection=self.connection, @@ -515,10 +517,16 @@ def _init_metadata(self, settings): settings.get('HBASE_BATCH_SIZE'), settings.get('STORE_CONTENT')) + def _init_domain_metadata(self, settings): + self._domain_metadata = DomainCache(settings.get('HBASE_DOMAIN_METADATA_CACHE_SIZE'), self.connection, + settings.get('HBASE_DOMAIN_METADATA_TABLE'), + batch_size=settings.get('HBASE_DOMAIN_METADATA_BATCH_SIZE')) + @classmethod def strategy_worker(cls, manager): o = cls(manager) o._init_states(manager.settings) + o._init_domain_metadata(manager.settings) return o @classmethod @@ -547,13 +555,17 @@ def queue(self): def states(self): return self._states + @property + def domain_metadata(self): + return self._domain_metadata + def frontier_start(self): - for component in [self.metadata, self.queue, self.states]: + for component in [self.metadata, self.queue, self.states, self.domain_metadata]: if component: component.frontier_start() def frontier_stop(self): - for component in [self.metadata, self.queue, self.states]: + for component in [self.metadata, self.queue, self.states, self.domain_metadata]: if component: component.frontier_stop() self.connection.close() diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 1285b1ff3..ab3ff87b8 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -16,6 +16,9 @@ HBASE_THRIFT_PORT = 9090 HBASE_NAMESPACE = 'crawler' HBASE_DROP_ALL_TABLES = False +HBASE_DOMAIN_METADATA_TABLE = 'domain_metadata' +HBASE_DOMAIN_METADATA_CACHE_SIZE = 1000 +HBASE_DOMAIN_METADATA_BATCH_SIZE = 100 HBASE_METADATA_TABLE = 'metadata' HBASE_STATES_TABLE = 'states' HBASE_USE_SNAPPY = False From 1f1f4a14d02509091e308c422efc33f29f9e9bae Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 16 Jul 2018 12:40:38 +0500 Subject: [PATCH 202/273] removing old memory backends --- frontera/contrib/backends/memory/__init__.py | 163 +----------------- .../backends/memory/test_backend_memory.py | 31 ---- 2 files changed, 5 insertions(+), 189 deletions(-) delete mode 100644 tests/contrib/backends/memory/test_backend_memory.py diff --git a/frontera/contrib/backends/memory/__init__.py b/frontera/contrib/backends/memory/__init__.py index d3db9a75c..d8b77477c 100644 --- a/frontera/contrib/backends/memory/__init__.py +++ b/frontera/contrib/backends/memory/__init__.py @@ -1,15 +1,13 @@ from __future__ import absolute_import -import logging -import random -from collections import deque, Iterable -from frontera.contrib.backends import CommonBackend +from collections import Iterable + +import logging +import six +from 
frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.core.components import Metadata, Queue, States, DistributedBackend -from frontera.core import OverusedBuffer from frontera.utils.heap import Heap -from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.utils.url import parse_domain_from_url_fast -import six from six.moves import range @@ -81,45 +79,6 @@ def _compare_pages(self, first, second): return cmp(first.meta[b'_scr'], second.meta[b'_scr']) -class MemoryDequeQueue(Queue): - def __init__(self, partitions, is_fifo=True): - """ - Deque-based queue (see collections module). Efficient queue for LIFO and FIFO strategies. - :param partitions: int count of partitions - :param type: bool, True for FIFO, False for LIFO - """ - self.partitions = [i for i in range(0, partitions)] - self.partitioner = Crc32NamePartitioner(self.partitions) - self.logger = logging.getLogger("memory.dequequeue") - self.queues = {} - self.is_fifo = is_fifo - for partition in self.partitions: - self.queues[partition] = deque() - - def count(self): - return sum([len(h) for h in six.itervalues(self.queues)]) - - def get_next_requests(self, max_n_requests, partition_id, **kwargs): - batch = [] - pop_op = self.queues[partition_id].popleft if self.is_fifo else self.queues[partition_id].pop - while max_n_requests > 0 and self.queues[partition_id]: - batch.append(pop_op()) - max_n_requests -= 1 - return batch - - def schedule(self, batch): - for fprint, score, request, schedule in batch: - if schedule: - request.meta[b'_scr'] = score - _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) - if not hostname: - self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint) - partition_id = self.partitions[0] - else: - partition_id = self.partitioner.partition(hostname, self.partitions) - self.queues[partition_id].append(request) - - class MemoryStates(States): def __init__(self, cache_size_limit): @@ -151,111 +110,6 @@ def flush(self): self._cache.clear() -class MemoryBaseBackend(CommonBackend): - """ - Base class for in-memory heapq Backend objects. 
- """ - component_name = 'Memory Base Backend' - - def __init__(self, manager): - self.manager = manager - settings = manager.settings - self._metadata = MemoryMetadata() - self._states = MemoryStates(settings.get("STATE_CACHE_SIZE")) - self._queue = self._create_queue(settings) - self._id = 0 - - @property - def metadata(self): - return self._metadata - - @property - def states(self): - return self._states - - @property - def queue(self): - return self._queue - - @classmethod - def from_manager(cls, manager): - return cls(manager) - - def _create_queue(self, settings): - return MemoryQueue(1) - - def add_seeds(self, seeds): - for seed in seeds: - seed.meta[b'id'] = self._id - self._id += 1 - super(MemoryBaseBackend, self).add_seeds(seeds) - - def links_extracted(self, request, links): - for link in links: - link.meta[b'id'] = self._id - self._id += 1 - super(MemoryBaseBackend, self).links_extracted(request, links) - - def finished(self): - return self.queue.count() == 0 - - -class MemoryDFSQueue(MemoryQueue): - def _compare_pages(self, first, second): - return cmp((second.meta[b'depth'], first.meta[b'id']), - (first.meta[b'depth'], second.meta[b'id'])) - - -class MemoryBFSQueue(MemoryQueue): - def _compare_pages(self, first, second): - return cmp((first.meta[b'depth'], first.meta[b'id']), - (second.meta[b'depth'], second.meta[b'id'])) - - -class MemoryRandomQueue(MemoryQueue): - def _compare_pages(self, first, second): - return random.choice([-1, 0, 1]) - - -class MemoryFIFOBackend(MemoryBaseBackend): - def _create_queue(self, settings): - return MemoryDequeQueue(settings.get('SPIDER_FEED_PARTITIONS')) - - -class MemoryLIFOBackend(MemoryBaseBackend): - def _create_queue(self, settings): - return MemoryDequeQueue(settings.get('SPIDER_FEED_PARTITIONS'), is_fifo=False) - - -class MemoryDFSBackend(MemoryBaseBackend): - def _create_queue(self, settings): - return MemoryDFSQueue(settings.get('SPIDER_FEED_PARTITIONS')) - - -class MemoryBFSBackend(MemoryBaseBackend): - def _create_queue(self, settings): - return MemoryBFSQueue(settings.get('SPIDER_FEED_PARTITIONS')) - - -class MemoryRandomBackend(MemoryBaseBackend): - def _create_queue(self, settings): - return MemoryRandomQueue(settings.get('SPIDER_FEED_PARTITIONS')) - - -class MemoryDFSOverusedBackend(MemoryDFSBackend): - def __init__(self, manager): - super(MemoryDFSOverusedBackend, self).__init__(manager) - settings = manager.settings - self.overused_buffer = OverusedBuffer(super(MemoryDFSOverusedBackend, self).get_next_requests, - settings.get("OVERUSED_MAX_PER_KEY"), - settings.get("OVERUSED_KEEP_PER_KEY"), - settings.get("OVERUSED_MAX_KEYS"), - settings.get("OVERUSED_KEEP_KEYS")) - - def get_next_requests(self, max_next_requests, **kwargs): - return self.overused_buffer.get_next_requests(max_next_requests, **kwargs) - - class MemoryDistributedBackend(DistributedBackend): def __init__(self, manager): settings = manager.settings @@ -309,10 +163,3 @@ def strategy_worker(cls, manager): def db_worker(cls, manager): return cls(manager) - -BASE = MemoryBaseBackend -FIFO = MemoryFIFOBackend -LIFO = MemoryLIFOBackend -DFS = MemoryDFSBackend -BFS = MemoryBFSBackend -RANDOM = MemoryRandomBackend diff --git a/tests/contrib/backends/memory/test_backend_memory.py b/tests/contrib/backends/memory/test_backend_memory.py deleted file mode 100644 index 4b1c6cf79..000000000 --- a/tests/contrib/backends/memory/test_backend_memory.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import absolute_import -from tests.test_overused_buffer import 
DFSOverusedBackendTest -from tests import backends - - -class TestFIFO(backends.FIFOBackendTest): - backend_class = 'frontera.contrib.backends.memory.FIFO' - - -class TestLIFO(backends.LIFOBackendTest): - backend_class = 'frontera.contrib.backends.memory.LIFO' - - -class TestDFS(backends.DFSBackendTest): - backend_class = 'frontera.contrib.backends.memory.DFS' - - -class TestDFSOverused(backends.DFSBackendTest): - backend_class = 'frontera.contrib.backends.memory.MemoryDFSOverusedBackend' - - -class TestDFSOverusedSimulation(DFSOverusedBackendTest): - backend_class = 'frontera.contrib.backends.memory.MemoryDFSOverusedBackend' - - -class TestBFS(backends.BFSBackendTest): - backend_class = 'frontera.contrib.backends.memory.BFS' - - -class TestRANDOM(backends.RANDOMBackendTest): - backend_class = 'frontera.contrib.backends.memory.RANDOM' From 276da483442bd89661a756a1e8e62a96f2d8d57d Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 16 Jul 2018 14:56:52 +0500 Subject: [PATCH 203/273] removing old sqlalchemy backends --- frontera/contrib/backends/__init__.py | 85 -------- .../contrib/backends/sqlalchemy/__init__.py | 105 +-------- .../contrib/backends/sqlalchemy/revisiting.py | 136 ------------ .../sqlalchemy/test_backend_sqlalchemy.py | 206 ------------------ 4 files changed, 4 insertions(+), 528 deletions(-) delete mode 100644 frontera/contrib/backends/sqlalchemy/revisiting.py delete mode 100644 tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py diff --git a/frontera/contrib/backends/__init__.py b/frontera/contrib/backends/__init__.py index 2dc89a1ee..40a96afc6 100644 --- a/frontera/contrib/backends/__init__.py +++ b/frontera/contrib/backends/__init__.py @@ -1,86 +1 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import -from collections import OrderedDict - -from frontera import Backend -from frontera.core.components import States - - -class CommonBackend(Backend): - """ - A simpliest possible backend, performing one-time crawl: if page was crawled once, it will not be crawled again. 
- """ - component_name = 'Common Backend' - - @classmethod - def from_manager(cls, manager): - return cls(manager) - - def frontier_start(self): - self.metadata.frontier_start() - self.queue.frontier_start() - self.states.frontier_start() - self.queue_size = self.queue.count() - - def frontier_stop(self): - self.metadata.frontier_stop() - self.queue.frontier_stop() - self.states.frontier_stop() - - def add_seeds(self, seeds): - for seed in seeds: - seed.meta[b'depth'] = 0 - self.metadata.add_seeds(seeds) - self.states.fetch([seed.meta[b'fingerprint'] for seed in seeds]) - self.states.set_states(seeds) - self._schedule(seeds) - self.states.update_cache(seeds) - - def _schedule(self, requests): - batch = [] - queue_incr = 0 - for request in requests: - schedule = True if request.meta[b'state'] in [States.NOT_CRAWLED, States.ERROR, None] else False - batch.append((request.meta[b'fingerprint'], self._get_score(request), request, schedule)) - if schedule: - queue_incr += 1 - request.meta[b'state'] = States.QUEUED - self.queue.schedule(batch) - self.metadata.update_score(batch) - self.queue_size += queue_incr - - def _get_score(self, obj): - return obj.meta.get(b'score', 1.0) - - def get_next_requests(self, max_next_requests, **kwargs): - partitions = kwargs.pop('partitions', [0]) # TODO: Collect from all known partitions - batch = [] - for partition_id in partitions: - batch.extend(self.queue.get_next_requests(max_next_requests, partition_id, **kwargs)) - self.queue_size -= len(batch) - return batch - - def page_crawled(self, response): - response.meta[b'state'] = States.CRAWLED - self.states.update_cache(response) - self.metadata.page_crawled(response) - - def links_extracted(self, request, links): - to_fetch = OrderedDict() - for link in links: - to_fetch[link.meta[b'fingerprint']] = link - link.meta[b'depth'] = request.meta.get(b'depth', 0)+1 - self.states.fetch(to_fetch.keys()) - self.states.set_states(links) - unique_links = to_fetch.values() - self.metadata.links_extracted(request, unique_links) - self._schedule(unique_links) - self.states.update_cache(unique_links) - - def request_error(self, request, error): - request.meta[b'state'] = States.ERROR - self.metadata.request_error(request, error) - self.states.update_cache(request) - - def finished(self): - return self.queue_size == 0 diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index d007ef1bd..391c9f659 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -1,109 +1,12 @@ from __future__ import absolute_import -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker -from sqlalchemy.engine.reflection import Inspector - -from frontera.core.components import DistributedBackend -from frontera.contrib.backends import CommonBackend from frontera.contrib.backends.sqlalchemy.components import Metadata, Queue, States, DomainMetadata from frontera.contrib.backends.sqlalchemy.models import DeclarativeBase +from frontera.core.components import DistributedBackend from frontera.utils.misc import load_object - - -class SQLAlchemyBackend(CommonBackend): - def __init__(self, manager): - self.manager = manager - settings = manager.settings - engine = settings.get('SQLALCHEMYBACKEND_ENGINE') - engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO') - drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') - clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') - models = 
settings.get('SQLALCHEMYBACKEND_MODELS') - - self.engine = create_engine(engine, echo=engine_echo) - self.models = dict([(name, load_object(klass)) for name, klass in models.items()]) - - if drop_all_tables: - DeclarativeBase.metadata.drop_all(self.engine) - DeclarativeBase.metadata.create_all(self.engine) - - self.session_cls = sessionmaker() - self.session_cls.configure(bind=self.engine) - - if clear_content: - session = self.session_cls() - for name, table in DeclarativeBase.metadata.tables.items(): - session.execute(table.delete()) - session.commit() - session.close() - self._metadata = Metadata(self.session_cls, self.models['MetadataModel'], - settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) - self._states = States(self.session_cls, self.models['StateModel'], - settings.get('STATE_CACHE_SIZE_LIMIT')) - self._queue = self._create_queue(settings) - - def frontier_stop(self): - super(SQLAlchemyBackend, self).frontier_stop() - self.engine.dispose() - - def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) - - @property - def queue(self): - return self._queue - - @property - def metadata(self): - return self._metadata - - @property - def states(self): - return self._states - - -class FIFOBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy FIFO Backend' - - def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS'), - ordering='created') - - -class LIFOBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy LIFO Backend' - - def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS'), - ordering='created_desc') - - -class DFSBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy DFS Backend' - - def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) - - def _get_score(self, obj): - return -obj.meta[b'depth'] - - -class BFSBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy BFS Backend' - - def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) - - def _get_score(self, obj): - return obj.meta[b'depth'] - - -BASE = CommonBackend -LIFO = LIFOBackend -FIFO = FIFOBackend -DFS = DFSBackend -BFS = BFSBackend +from sqlalchemy import create_engine +from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.orm import sessionmaker class Distributed(DistributedBackend): diff --git a/frontera/contrib/backends/sqlalchemy/revisiting.py b/frontera/contrib/backends/sqlalchemy/revisiting.py deleted file mode 100644 index f079756d5..000000000 --- a/frontera/contrib/backends/sqlalchemy/revisiting.py +++ /dev/null @@ -1,136 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -import logging -from datetime import datetime, timedelta -from time import time, sleep -from calendar import timegm - -from sqlalchemy import Column, BigInteger - -from frontera import Request -from frontera.contrib.backends.partitioners import Crc32NamePartitioner -from frontera.contrib.backends.sqlalchemy import SQLAlchemyBackend -from frontera.contrib.backends.sqlalchemy.models import QueueModelMixin, DeclarativeBase -from frontera.core.components import Queue as BaseQueue, States -from frontera.utils.misc import get_crc32 -from frontera.utils.url import parse_domain_from_url_fast -from six.moves import 
range - - -def utcnow_timestamp(): - d = datetime.utcnow() - return timegm(d.timetuple()) - - -class RevisitingQueueModel(QueueModelMixin, DeclarativeBase): - __tablename__ = 'revisiting_queue' - - crawl_at = Column(BigInteger, nullable=False) - - -def retry_and_rollback(func): - def func_wrapper(self, *args, **kwargs): - tries = 5 - while True: - try: - return func(self, *args, **kwargs) - except Exception as exc: - self.logger.exception(exc) - self.session.rollback() - sleep(5) - tries -= 1 - if tries > 0: - self.logger.info("Tries left %i" % tries) - continue - else: - raise exc - return func_wrapper - - -class RevisitingQueue(BaseQueue): - def __init__(self, session_cls, queue_cls, partitions): - self.session = session_cls() - self.queue_model = queue_cls - self.logger = logging.getLogger("sqlalchemy.revisiting.queue") - self.partitions = [i for i in range(0, partitions)] - self.partitioner = Crc32NamePartitioner(self.partitions) - - def frontier_stop(self): - self.session.close() - - def get_next_requests(self, max_n_requests, partition_id, **kwargs): - results = [] - try: - for item in self.session.query(self.queue_model).\ - filter(RevisitingQueueModel.crawl_at <= utcnow_timestamp(), - RevisitingQueueModel.partition_id == partition_id).\ - limit(max_n_requests): - method = 'GET' if not item.method else item.method - results.append(Request(item.url, method=method, meta=item.meta, headers=item.headers, - cookies=item.cookies)) - self.session.delete(item) - self.session.commit() - except Exception as exc: - self.logger.exception(exc) - self.session.rollback() - return results - - @retry_and_rollback - def schedule(self, batch): - to_save = [] - for fprint, score, request, schedule in batch: - if schedule: - _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) - if not hostname: - self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint)) - partition_id = self.partitions[0] - host_crc32 = 0 - else: - partition_id = self.partitioner.partition(hostname, self.partitions) - host_crc32 = get_crc32(hostname) - schedule_at = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else utcnow_timestamp() - q = self.queue_model(fingerprint=fprint, score=score, url=request.url, meta=request.meta, - headers=request.headers, cookies=request.cookies, method=request.method, - partition_id=partition_id, host_crc32=host_crc32, created_at=time()*1E+6, - crawl_at=schedule_at) - to_save.append(q) - request.meta[b'state'] = States.QUEUED - self.session.bulk_save_objects(to_save) - self.session.commit() - - @retry_and_rollback - def count(self): - return self.session.query(self.queue_model).count() - - -class Backend(SQLAlchemyBackend): - """ - DEPRECATED, and will be removed in the next versions. Revisiting is meant to be implemented as part of - crawling strategy. 
- """ - - def _create_queue(self, settings): - self.interval = settings.get("SQLALCHEMYBACKEND_REVISIT_INTERVAL") - assert isinstance(self.interval, timedelta) - self.interval = self.interval.total_seconds() - return RevisitingQueue(self.session_cls, RevisitingQueueModel, settings.get('SPIDER_FEED_PARTITIONS')) - - def _schedule(self, requests): - batch = [] - for request in requests: - if request.meta[b'state'] in [States.NOT_CRAWLED]: - request.meta[b'crawl_at'] = utcnow_timestamp() - elif request.meta[b'state'] in [States.CRAWLED, States.ERROR]: - request.meta[b'crawl_at'] = utcnow_timestamp() + self.interval - else: - continue # QUEUED - batch.append((request.meta[b'fingerprint'], self._get_score(request), request, True)) - self.queue.schedule(batch) - self.metadata.update_score(batch) - self.queue_size += len(batch) - - def page_crawled(self, response): - super(Backend, self).page_crawled(response) - self.states.set_states(response.request) - self._schedule([response.request]) - self.states.update_cache(response.request) diff --git a/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py b/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py deleted file mode 100644 index 98ebdc400..000000000 --- a/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py +++ /dev/null @@ -1,206 +0,0 @@ -from __future__ import absolute_import -import os - -import pymysql -from psycopg2 import connect -from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT - -from tests import backends -from tests.test_revisiting_backend import RevisitingBackendTest - - -#---------------------------------------------------- -# SQAlchemy base classes -#---------------------------------------------------- -class SQLAlchemyFIFO(backends.FIFOBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.FIFO' - - -class SQLAlchemyLIFO(backends.LIFOBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.LIFO' - - -class SQLAlchemyDFS(backends.DFSBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.DFS' - - -class SQLAlchemyBFS(backends.BFSBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.BFS' - - - -#---------------------------------------------------- -# SQLite Memory -#---------------------------------------------------- -class SQLiteMemory(backends.BackendTest): - - def get_settings(self): - settings = super(SQLiteMemory, self).get_settings() - settings.SQLALCHEMYBACKEND_ENGINE = 'sqlite:///:memory:' - return settings - - -class TestSQLiteMemoryFIFO(SQLAlchemyFIFO, SQLiteMemory): - pass - - -class TestSQLiteMemoryLIFO(SQLAlchemyLIFO, SQLiteMemory): - pass - - -class TestSQLiteMemoryDFS(SQLAlchemyDFS, SQLiteMemory): - pass - - -class TestSQLiteMemoryBFS(SQLAlchemyBFS, SQLiteMemory): - pass - - -#---------------------------------------------------- -# SQLite File -#---------------------------------------------------- -class SQLiteFile(backends.BackendTest): - - SQLITE_DB_NAME = 'backend_test.db' - - def get_settings(self): - settings = super(SQLiteFile, self).get_settings() - settings.SQLALCHEMYBACKEND_ENGINE = 'sqlite:///' + self.SQLITE_DB_NAME - return settings - - def setup_backend(self, method): - self._delete_test_db() - - def teardown_backend(self, method): - self._delete_test_db() - - def _delete_test_db(self): - try: - os.remove(self.SQLITE_DB_NAME) - except OSError: - pass - - -class TestSQLiteFileFIFO(SQLAlchemyFIFO, SQLiteFile): - pass - - -class TestSQLiteFileLIFO(SQLAlchemyLIFO, SQLiteFile): - pass - - -class 
TestSQLiteFileDFS(SQLAlchemyDFS, SQLiteFile): - pass - - -class TestSQLiteFileBFS(SQLAlchemyBFS, SQLiteFile): - pass - - - -#---------------------------------------------------- -# DB Backend test base -#---------------------------------------------------- -class DBBackendTest(object): - - DB_DATABASE = 'backend_test' - DB_ENGINE = None - DB_HOST = None - DB_USER = None - DB_PASSWORD = None - - def get_settings(self): - settings = super(DBBackendTest, self).get_settings() - settings.SQLALCHEMYBACKEND_ENGINE = self.DB_ENGINE - return settings - - def setup_backend(self, method): - self._delete_database() - self._create_database() - - def teardown_backend(self, method): - self._delete_database() - - def _delete_database(self): - self._execute_sql("DROP DATABASE IF EXISTS %s;" % self.DB_DATABASE) - - def _create_database(self): - self._execute_sql("CREATE DATABASE %s;" % self.DB_DATABASE) - - def _execute_sql(self, sql): - raise NotImplementedError - - -#---------------------------------------------------- -# Mysql -#---------------------------------------------------- -class Mysql(DBBackendTest): - - DB_ENGINE = 'mysql+pymysql://root:@localhost/backend_test' - DB_HOST = 'localhost' - DB_USER = 'root' - DB_PASSWORD = '' - - def _execute_sql(self, sql): - conn = pymysql.connect(host=self.DB_HOST, - user=self.DB_USER, - passwd=self.DB_PASSWORD) - cur = conn.cursor() - cur.execute(sql) - cur.close() - conn.close() - - -class TestMysqlFIFO(Mysql, SQLAlchemyFIFO): - pass - - -class TestMysqlLIFO(Mysql, SQLAlchemyLIFO): - pass - - -class TestMysqlDFS(Mysql, SQLAlchemyDFS): - pass - - -class TestMysqlBFS(Mysql, SQLAlchemyBFS): - pass - - - -#---------------------------------------------------- -# Postgres -#---------------------------------------------------- -class Postgres(DBBackendTest): - - DB_ENGINE = 'postgres://postgres@localhost/backend_test' - DB_HOST = 'localhost' - DB_USER = 'postgres' - DB_PASSWORD = '' - - def _execute_sql(self, sql): - conn = connect(host=self.DB_HOST, - user=self.DB_USER, - password=self.DB_PASSWORD) - conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) - cur = conn.cursor() - cur.execute(sql) - cur.close() - conn.close() - - -class TestPostgresFIFO(Postgres, SQLAlchemyFIFO): - pass - - -class TestPostgresLIFO(Postgres, SQLAlchemyLIFO): - pass - - -class TestPostgresDFS(Postgres, SQLAlchemyDFS): - pass - - -class TestPostgresBFS(Postgres, SQLAlchemyBFS): - pass - From f2ba99c33e35584105890de4476e9eb09234f93d Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 16 Jul 2018 19:11:31 +0500 Subject: [PATCH 204/273] pointing to existing memory backend --- frontera/settings/default_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index ab3ff87b8..a9dd86a64 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -3,7 +3,7 @@ AUTO_START = True -BACKEND = 'frontera.contrib.backends.memory.FIFO' +BACKEND = 'frontera.contrib.backends.memory.MemoryDistributedBackend' BC_MIN_REQUESTS = 64 BC_MIN_HOSTS = 24 BC_MAX_REQUESTS_PER_HOST = 128 From 8865dafe116e3a29d1ec3fb3654dace7ed07d0f1 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 20 Jul 2018 16:09:21 +0500 Subject: [PATCH 205/273] backends docs update --- docs/source/topics/frontera-settings.rst | 3 +- docs/source/topics/frontier-backends.rst | 159 ++++++------------- docs/source/topics/glossary.rst | 8 +- frontera/contrib/backends/memory/__init__.py | 5 + 4 
files changed, 59 insertions(+), 116 deletions(-) diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index db7ab90bb..e5356ea5d 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -551,7 +551,8 @@ Default:: 'QueueModel': 'frontera.contrib.backends.sqlalchemy.models.QueueModel' } -This is mapping with SQLAlchemy models used by backends. It is mainly used for customization. +This is mapping with SQLAlchemy models used by backends. It is mainly used for customization. This setting uses a +dictionary where ``key`` represents the name of the model to define and ``value`` the model to use. Revisiting backend diff --git a/docs/source/topics/frontier-backends.rst b/docs/source/topics/frontier-backends.rst index dec8a6694..0f3d7d468 100644 --- a/docs/source/topics/frontier-backends.rst +++ b/docs/source/topics/frontier-backends.rst @@ -2,20 +2,21 @@ Backends ======== -Frontier :class:`Backend ` is where the crawling logic/policies lies, essentially a -brain of your crawler. :class:`Queue `, -:class:`Metadata ` and :class:`States ` are classes -where all low level code is meant to be placed, and -Backend opposite, operates on a higher levels. Frontera is bundled with database and in-memory implementations of -Queue, Metadata and States which can be combined in your custom backends or used standalone by directly -instantiating :class:`FrontierManager ` and Backend. - -Backend methods are called by the FrontierManager after +A :class:`DistributedBackend ` is used to separate higher level code +of :term:`crawling strategy` from low level storage API. :class:`Queue `, +:class:`Metadata `, :class:`States ` and + :class:`DomainMetadata ` are inner components of the DistributedBackend. +The latter is meant to instantiate and hold the references to the objects of above mentioned classes. Frontera is +bundled with database and in-memory implementations of Queue, Metadata, States and DomainMetadata which can be combined +in your custom backends or used standalone by directly instantiating specific variant of +:class:`FrontierManager `. + +DistributedBackend methods are called by the FrontierManager after :class:`Middleware `, using hooks for :class:`Request ` and :class:`Response ` processing according to :ref:`frontier data flow `. -Unlike Middleware, that can have many different instances activated, only one Backend can be used per +Unlike Middleware, that can have many different instances activated, only one DistributedBackend can be used per frontier. @@ -24,11 +25,11 @@ frontier. Activating a backend ==================== -To activate the frontier backend component, set it through the :setting:`BACKEND` setting. +To activate the specific backend, set it through the :setting:`BACKEND` setting. Here’s an example:: - BACKEND = 'frontera.contrib.backends.memory.FIFO' + BACKEND = 'frontera.contrib.backends.memory.MemoryDistributedBackend' Keep in mind that some backends may need to be additionally configured through a particular setting. See :ref:`backends documentation ` for more info. @@ -38,9 +39,9 @@ Keep in mind that some backends may need to be additionally configured through a Writing your own backend ======================== -Each backend component is a single Python class inherited from :class:`Backend ` or +Each backend component is a single Python class inherited from :class:`DistributedBackend ` and using one or all of -:class:`Queue`, :class:`Metadata` and :class:`States`. 
+:class:`Queue`, :class:`Metadata`, :class:`States` and :class:`DomainMetadata`. :class:`FrontierManager` will communicate with active backend through the methods described below. @@ -99,6 +100,8 @@ Backend should communicate with low-level storage by means of these classes: Metadata ^^^^^^^^ +Is used to store the contents of the crawl. + .. autoclass:: frontera.core.components.Metadata **Methods** @@ -115,6 +118,8 @@ Known implementations are: :class:`MemoryMetadata` and :class:`sqlalchemy.compon Queue ^^^^^ +Is a priority queue and used to persist requests scheduled for crawling. + .. autoclass:: frontera.core.components.Queue **Methods** @@ -130,6 +135,9 @@ Known implementations are: :class:`MemoryQueue` and :class:`sqlalchemy.component States ^^^^^^ +Is a storage used for checking and storing the link states. Where state is a short integer of one of states descibed in +:class:`frontera.core.components.States`. + .. autoclass:: frontera.core.components.States **Methods** @@ -145,69 +153,45 @@ States Known implementations are: :class:`MemoryStates` and :class:`sqlalchemy.components.States`. +DomainMetadata +^^^^^^^^^^^^^^ -.. _frontier-built-in-backend: - -Built-in backend reference -========================== - -This article describes all backend components that come bundled with Frontera. - -To know the default activated :class:`Backend ` check the -:setting:`BACKEND` setting. - +Is used to store per-domain flags, counters or even robots.txt contents to help :term:`crawling strategy` maintain +features like per-domain number of crawled pages limit or automatic banning. -.. _frontier-backends-basic-algorithms: +.. autoclass:: frontera.core.components.DomainMetadata -Basic algorithms -^^^^^^^^^^^^^^^^ -Some of the built-in :class:`Backend ` objects implement basic algorithms such -as `FIFO`_/`LIFO`_ or `DFS`_/`BFS`_ for page visit ordering. - -Differences between them will be on storage engine used. For instance, -:class:`memory.FIFO ` and -:class:`sqlalchemy.FIFO ` will use the same logic but with different -storage engines. - -All these backend variations are using the same :class:`CommonBackend ` class -implementing one-time visit crawling policy with priority queue. - -.. autoclass:: frontera.contrib.backends.CommonBackend - - -.. _frontier-backends-memory: + **Methods** -Memory backends -^^^^^^^^^^^^^^^ + .. automethod:: frontera.core.components.DomainMetadata.__setitem__ -This set of :class:`Backend ` objects will use an `heapq`_ module as queue and native -dictionaries as storage for :ref:`basic algorithms `. + .. automethod:: frontera.core.components.DomainMetadata.__getitem__ + .. automethod:: frontera.core.components.DomainMetadata.__delitem__ -.. class:: frontera.contrib.backends.memory.BASE + .. automethod:: frontera.core.components.DomainMetadata.__contains__ - Base class for in-memory :class:`Backend ` objects. -.. class:: frontera.contrib.backends.memory.FIFO +Known implementations are: native dict and :class:`sqlalchemy.components.DomainMetadata`. - In-memory :class:`Backend ` implementation of `FIFO`_ algorithm. -.. class:: frontera.contrib.backends.memory.LIFO +.. _frontier-built-in-backend: - In-memory :class:`Backend ` implementation of `LIFO`_ algorithm. +Built-in backend reference +========================== -.. class:: frontera.contrib.backends.memory.BFS +This article describes all backend components that come bundled with Frontera. - In-memory :class:`Backend ` implementation of `BFS`_ algorithm. -.. class:: frontera.contrib.backends.memory.DFS +.. 
_frontier-backends-memory: - In-memory :class:`Backend ` implementation of `DFS`_ algorithm. +Memory backend +^^^^^^^^^^^^^^ -.. class:: frontera.contrib.backends.memory.RANDOM +This implementation is using `heapq`_ module to store the requests queue and native dicts for other purposes and is +meant to be used for educational or testing purposes only. - In-memory :class:`Backend ` implementation of a random selection - algorithm. +.. autoclass:: frontera.contrib.backends.memory.MemoryDistributedBackend .. _frontier-backends-sqlalchemy: @@ -215,60 +199,17 @@ dictionaries as storage for :ref:`basic algorithms ` objects will use `SQLAlchemy`_ as storage for -:ref:`basic algorithms `. +This implementations is using RDBMS storage with `SQLAlchemy`_ library. By default it uses an in-memory SQLite database as a storage engine, but `any databases supported by SQLAlchemy`_ can be used. - If you need to use your own `declarative sqlalchemy models`_, you can do it by using the :setting:`SQLALCHEMYBACKEND_MODELS` setting. -This setting uses a dictionary where ``key`` represents the name of the model to define and ``value`` the model to use. - For a complete list of all settings used for SQLAlchemy backends check the :doc:`settings ` section. -.. class:: frontera.contrib.backends.sqlalchemy.BASE - - Base class for SQLAlchemy :class:`Backend ` objects. - -.. class:: frontera.contrib.backends.sqlalchemy.FIFO - - SQLAlchemy :class:`Backend ` implementation of `FIFO`_ algorithm. - -.. class:: frontera.contrib.backends.sqlalchemy.LIFO - - SQLAlchemy :class:`Backend ` implementation of `LIFO`_ algorithm. - -.. class:: frontera.contrib.backends.sqlalchemy.BFS - - SQLAlchemy :class:`Backend ` implementation of `BFS`_ algorithm. - -.. class:: frontera.contrib.backends.sqlalchemy.DFS - - SQLAlchemy :class:`Backend ` implementation of `DFS`_ algorithm. - -.. class:: frontera.contrib.backends.sqlalchemy.RANDOM - - SQLAlchemy :class:`Backend ` implementation of a random selection - algorithm. - - -Revisiting backend -^^^^^^^^^^^^^^^^^^ - -Based on custom SQLAlchemy backend, and queue. Crawling starts with seeds. After seeds are crawled, every new -document will be scheduled for immediate crawling. On fetching every new document will be scheduled for recrawling -after fixed interval set by :setting:`SQLALCHEMYBACKEND_REVISIT_INTERVAL`. - -Current implementation of revisiting backend has no prioritization. During long term runs spider could go idle, because -there are no documents available for crawling, but there are documents waiting for their scheduled revisit time. - - -.. class:: frontera.contrib.backends.sqlalchemy.revisiting.Backend - - Base class for SQLAlchemy :class:`Backend ` implementation of revisiting back-end. +.. autoclass:: frontera.contrib.backends.sqlalchemy.Distributed HBase backend @@ -281,11 +222,6 @@ tunning a block cache to fit states within one block for average size website. T :attr:`hostname_local_fingerprint ` to achieve documents closeness within the same host. This function can be selected with :setting:`URL_FINGERPRINT_FUNCTION` setting. -.. TODO: document details of block cache tuning, - BC* settings and queue get operation concept, - hbase tables schema and data flow - Queue exploration - shuffling with MR jobs Redis backend ^^^^^^^^^^^^^ @@ -300,10 +236,7 @@ items to the database; that metadata or queue items are lost. In case of connection errors; the crawler will attempt to reconnect three times. 
If the third attempt at connecting to Redis fails, the worker will skip that Redis operation and continue operating.
 
-.. _FIFO: http://en.wikipedia.org/wiki/FIFO
-.. _LIFO: http://en.wikipedia.org/wiki/LIFO_(computing)
-.. _DFS: http://en.wikipedia.org/wiki/Depth-first_search
-.. _BFS: http://en.wikipedia.org/wiki/Breadth-first_search
+
 .. _OrderedDict: https://docs.python.org/2/library/collections.html#collections.OrderedDict
 .. _heapq: https://docs.python.org/2/library/heapq.html
 .. _SQLAlchemy: http://www.sqlalchemy.org/
diff --git a/docs/source/topics/glossary.rst b/docs/source/topics/glossary.rst
index fb9ab6c38..6321b15d4 100644
--- a/docs/source/topics/glossary.rst
+++ b/docs/source/topics/glossary.rst
@@ -16,8 +16,8 @@ Glossary
     A stream of messages from :term:`db worker` to spiders containing new batches of documents to crawl.
 
     strategy worker
-        Special type of worker, running the crawling strategy code: scoring the links, deciding if link needs to be
-        scheduled (consults :term:`state cache`) and when to stop crawling. That type of worker is sharded.
+        Special type of worker, running the :term:`crawling strategy` code: scoring the links, deciding if a link
+        needs to be scheduled (consults :term:`state cache`) and when to stop crawling. That type of worker is sharded.
 
     db worker
         Is responsible for communicating with storage DB, and mainly saving metadata and content along with
@@ -34,3 +34,7 @@ Glossary
     spider
         A process retrieving and extracting content from the Web, using :term:`spider feed` as incoming queue and
         storing results to :term:`spider log`. In this documentation fetcher is used as a synonym.
+
+    crawling strategy
+        A class containing crawling logic covering seeds addition, processing of downloaded content and scheduling of
+        new requests to crawl.
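
The domain metadata interface documented in this patch is deliberately dict-like: a plain dict already satisfies it,
which is presumably why the docs list "native dict" as a known implementation. A minimal sketch of that contract
(illustrative only, not part of the patch; the example domain and keys are made up)::

    # The four methods the DomainMetadata docs describe, exercised on a
    # plain dict, the simplest conforming implementation.
    dm = dict()

    dm["example.com"] = {"queued_pages": 0}   # __setitem__: store per-domain data
    if "example.com" in dm:                   # __contains__: cheap existence check
        meta = dm["example.com"]              # __getitem__: raises KeyError when absent
        meta["queued_pages"] += 1
    del dm["example.com"]                     # __delitem__: drop the whole record

The SQLAlchemy ``DomainMetadata`` component added earlier in this series implements the same four methods on top of a
key/value table, so crawling strategy code stays storage-agnostic.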
diff --git a/frontera/contrib/backends/memory/__init__.py b/frontera/contrib/backends/memory/__init__.py index d8b77477c..86ce98b5d 100644 --- a/frontera/contrib/backends/memory/__init__.py +++ b/frontera/contrib/backends/memory/__init__.py @@ -116,6 +116,7 @@ def __init__(self, manager): self._states = MemoryStates(1000) self._queue = MemoryQueue(settings.get('SPIDER_FEED_PARTITIONS')) self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS') + self._domain_metadata = dict() def add_seeds(self, seeds): pass @@ -144,6 +145,10 @@ def queue(self): def states(self): return self._states + @property + def domain_metadata(self): + return self._domain_metadata + def get_next_requests(self, max_n_requests, **kwargs): next_pages = [] partitions = set(kwargs.pop('partitions', [])) From f5da2ecf166e213a0df3214f5269f7e050739e58 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 20 Jul 2018 16:33:26 +0500 Subject: [PATCH 206/273] local mode instantiation --- frontera/contrib/backends/memory/__init__.py | 4 + .../backends/redis_backend/__init__.py | 29 +++++-- .../contrib/backends/sqlalchemy/__init__.py | 83 +++++++++---------- frontera/core/components.py | 4 + 4 files changed, 69 insertions(+), 51 deletions(-) diff --git a/frontera/contrib/backends/memory/__init__.py b/frontera/contrib/backends/memory/__init__.py index 86ce98b5d..dda33851c 100644 --- a/frontera/contrib/backends/memory/__init__.py +++ b/frontera/contrib/backends/memory/__init__.py @@ -168,3 +168,7 @@ def strategy_worker(cls, manager): def db_worker(cls, manager): return cls(manager) + @classmethod + def local(cls, manager): + return cls(manager) + diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 9fadca6ad..dcadbdb31 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -378,22 +378,33 @@ def __init__(self, manager): @classmethod def strategy_worker(cls, manager): o = cls(manager) - settings = manager.settings - o._states = RedisState(o.pool, settings.get('REDIS_STATE_CACHE_SIZE_LIMIT')) + o._init(manager, "strategy_worker") return o @classmethod def db_worker(cls, manager): o = cls(manager) - settings = manager.settings - clear = settings.get('REDIS_DROP_ALL_TABLES') - o._queue = RedisQueue(manager, o.pool, o.queue_partitions, delete_all_keys=clear) - o._metadata = RedisMetadata( - o.pool, - clear - ) + o._init(manager, "db_worker") + return o + + @classmethod + def local(cls, manager): + o = cls(manager) + o._init(manager) return o + def _init(self, manager, typ="all"): + settings = manager.settings + if typ in ["strategy_worker", "all"]: + self._states = RedisState(self.pool, settings.get('REDIS_STATE_CACHE_SIZE_LIMIT')) + if typ in ["db_worker", "all"]: + clear = settings.get('REDIS_DROP_ALL_TABLES') + self._queue = RedisQueue(manager, self.pool, self.queue_partitions, delete_all_keys=clear) + self._metadata = RedisMetadata( + self.pool, + clear + ) + @property def metadata(self): return self._metadata diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index 391c9f659..8ce136115 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -25,57 +25,56 @@ def __init__(self, manager): self._states = None self._domain_metadata = None - @classmethod - def strategy_worker(cls, manager): - b = cls(manager) + def check_and_create_tables(self, is_drop, 
is_clear, models): + inspector = Inspector.from_engine(self.engine) + for model in models: + if is_drop: + if model.__table__.name in inspector.get_table_names(): + model.__table__.drop(bind=self.engine) + model.__table__.create(bind=self.engine) + if is_clear: + session = self.session_cls() + session.execute(model.__table__.delete()) + session.close() + + def _init_strategy_worker(self, manager): settings = manager.settings drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') - model = b.models['StateModel'] - inspector = Inspector.from_engine(b.engine) - - if drop_all_tables: - if model.__table__.name in inspector.get_table_names(): - model.__table__.drop(bind=b.engine) - model.__table__.create(bind=b.engine) - - if clear_content: - session = b.session_cls() - session.execute(model.__table__.delete()) - session.close() - b._states = States(b.session_cls, model, + model = self.models['StateModel'] + self.check_and_create_tables(drop_all_tables, clear_content, (model,)) + self._states = States(b.session_cls, model, settings.get('STATE_CACHE_SIZE_LIMIT')) - b._domain_metadata = DomainMetadata(b.session_cls) - return b + self._domain_metadata = DomainMetadata(b.session_cls) - @classmethod - def db_worker(cls, manager): - b = cls(manager) + def _init_db_worker(self, manager): settings = manager.settings drop = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') - inspector = Inspector.from_engine(b.engine) - - metadata_m = b.models['MetadataModel'] - queue_m = b.models['QueueModel'] - if drop: - existing = inspector.get_table_names() - if metadata_m.__table__.name in existing: - metadata_m.__table__.drop(bind=b.engine) - if queue_m.__table__.name in existing: - queue_m.__table__.drop(bind=b.engine) - metadata_m.__table__.create(bind=b.engine) - queue_m.__table__.create(bind=b.engine) - - if clear_content: - session = b.session_cls() - session.execute(metadata_m.__table__.delete()) - session.execute(queue_m.__table__.delete()) - session.close() - - b._metadata = Metadata(b.session_cls, metadata_m, + metadata_m = self.models['MetadataModel'] + queue_m = self.models['QueueModel'] + self.check_and_create_tables(drop, clear_content, (metadata_m, queue_m,)) + self._metadata = Metadata(self.session_cls, metadata_m, settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) - b._queue = Queue(b.session_cls, queue_m, settings.get('SPIDER_FEED_PARTITIONS')) + self._queue = Queue(self.session_cls, queue_m, settings.get('SPIDER_FEED_PARTITIONS')) + + @classmethod + def strategy_worker(cls, manager): + b = cls(manager) + b._init_strategy_worker(manager) + return b + + @classmethod + def db_worker(cls, manager): + b = cls(manager) + b._init_db_worker(manager) + return b + + @classmethod + def local(cls, manager): + b = cls(manager) + b._init_db_worker(manager) + b._init_strategy_worker(manager) return b @property diff --git a/frontera/core/components.py b/frontera/core/components.py index 1cda8d215..8c29ca17f 100644 --- a/frontera/core/components.py +++ b/frontera/core/components.py @@ -314,6 +314,10 @@ def strategy_worker(cls, manager): def db_worker(cls, manager): raise NotImplementedError + @classmethod + def local(cls, manager): + raise NotImplementedError + def get_stats(self): """ Returns a dictionary with distributed backend stats. 
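
The ``local()`` class method introduced above complements ``strategy_worker()`` and ``db_worker()``: in a
single-process run, one backend object has to carry both the strategy-worker components (states, domain metadata) and
the db-worker components (metadata, queue). A hedged sketch of how a caller could dispatch on run mode (the
``create_backend`` helper and the ``run_mode`` values are illustrative, not part of this patch)::

    from frontera.utils.misc import load_object

    def create_backend(backend_path, manager, run_mode):
        # Resolve the dotted path to a DistributedBackend subclass.
        cls = load_object(backend_path)
        if run_mode == "strategy_worker":
            return cls.strategy_worker(manager)  # states + domain metadata only
        if run_mode == "db_worker":
            return cls.db_worker(manager)        # metadata + queue only
        return cls.local(manager)                # single process: all components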
From 4028a366924c20a6617d76c4c14c842bc7ffa4ab Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 20 Jul 2018 16:42:55 +0500 Subject: [PATCH 207/273] error fix --- frontera/contrib/backends/sqlalchemy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index 8ce136115..d825e3796 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -43,9 +43,9 @@ def _init_strategy_worker(self, manager): clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') model = self.models['StateModel'] self.check_and_create_tables(drop_all_tables, clear_content, (model,)) - self._states = States(b.session_cls, model, + self._states = States(self.session_cls, model, settings.get('STATE_CACHE_SIZE_LIMIT')) - self._domain_metadata = DomainMetadata(b.session_cls) + self._domain_metadata = DomainMetadata(self.session_cls) def _init_db_worker(self, manager): settings = manager.settings From 61efc5efae3550a85a8035b5c457e4e5a0368c23 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 20 Jul 2018 16:44:34 +0500 Subject: [PATCH 208/273] style --- frontera/contrib/backends/sqlalchemy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index d825e3796..1d4a62814 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -44,7 +44,7 @@ def _init_strategy_worker(self, manager): model = self.models['StateModel'] self.check_and_create_tables(drop_all_tables, clear_content, (model,)) self._states = States(self.session_cls, model, - settings.get('STATE_CACHE_SIZE_LIMIT')) + settings.get('STATE_CACHE_SIZE_LIMIT')) self._domain_metadata = DomainMetadata(self.session_cls) def _init_db_worker(self, manager): @@ -55,7 +55,7 @@ def _init_db_worker(self, manager): queue_m = self.models['QueueModel'] self.check_and_create_tables(drop, clear_content, (metadata_m, queue_m,)) self._metadata = Metadata(self.session_cls, metadata_m, - settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) + settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) self._queue = Queue(self.session_cls, queue_m, settings.get('SPIDER_FEED_PARTITIONS')) @classmethod From 97fe2ab42df8b369b2e2c2bc4ed02d5b74a0e2eb Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 23 Jul 2018 11:37:43 +0500 Subject: [PATCH 209/273] fix of references in docs --- docs/source/topics/architecture.rst | 2 +- docs/source/topics/cluster-setup.rst | 2 +- docs/source/topics/frontera-settings.rst | 2 +- docs/source/topics/frontier-tester.rst | 2 +- docs/source/topics/message_bus.rst | 2 +- docs/source/topics/strategies.rst | 4 ++++ docs/source/topics/tests.rst | 29 ------------------------ 7 files changed, 9 insertions(+), 34 deletions(-) create mode 100644 docs/source/topics/strategies.rst diff --git a/docs/source/topics/architecture.rst b/docs/source/topics/architecture.rst index 2e69c9138..2777d44ce 100644 --- a/docs/source/topics/architecture.rst +++ b/docs/source/topics/architecture.rst @@ -102,7 +102,7 @@ Where *sharded* means component consumes messages of assigned partition only, e. stream, and *replicated* is when components consume stream regardless of partitioning. Such design allows to operate online. Crawling strategy can be changed without having to stop the crawl. 
Also -:doc:`crawling strategy ` can be implemented as a separate module; containing logic +:doc:`crawling strategy ` can be implemented as a separate module; containing logic for checking the crawling stopping condition, URL ordering, and scoring model. Frontera is polite to web hosts by design and each host is downloaded by no more than one spider process. diff --git a/docs/source/topics/cluster-setup.rst b/docs/source/topics/cluster-setup.rst index 26c51c117..f5c84a15f 100644 --- a/docs/source/topics/cluster-setup.rst +++ b/docs/source/topics/cluster-setup.rst @@ -20,7 +20,7 @@ Things to setup before you start Things to implement before you start ==================================== -* :doc:`Crawling strategy ` +* :doc:`Crawling strategy ` or :doc:`pick one from Frontera package ` * Spider code Configuring Kafka diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index e5356ea5d..ad1368e43 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -609,7 +609,7 @@ HBASE_DOMAIN_METADATA_CACHE_SIZE Default: 1000 -The count of domain-value pairs cached in memory in :term:`strategy-worker`. Pairs are evicted from cache using LRU +The count of domain-value pairs cached in memory in :term:`strategy worker`. Pairs are evicted from cache using LRU policy. diff --git a/docs/source/topics/frontier-tester.rst b/docs/source/topics/frontier-tester.rst index eef48bfe9..8b37ef490 100644 --- a/docs/source/topics/frontier-tester.rst +++ b/docs/source/topics/frontier-tester.rst @@ -55,7 +55,7 @@ settings, but also can be modified when creating the FrontierTester with the ``m An example of use ================= -A working example using test data from graphs and :ref:`basic backends `:: +A working example using test data from graphs and :ref:`backends `:: from frontera import FrontierManager, Settings, FrontierTester, graphs diff --git a/docs/source/topics/message_bus.rst b/docs/source/topics/message_bus.rst index 6f67af0c0..95478361e 100644 --- a/docs/source/topics/message_bus.rst +++ b/docs/source/topics/message_bus.rst @@ -31,7 +31,7 @@ components startup to avoid message loss: #. :term:`db worker` #. :term:`strategy worker` -#. :term:`spiders` +#. :term:`spider`s Unfortunately, it's not possible to avoid message loss when stopping running crawler with unfinished crawl. We recommend to use Kafka message bus if your crawler application is sensitive to small message loss. diff --git a/docs/source/topics/strategies.rst b/docs/source/topics/strategies.rst new file mode 100644 index 000000000..77188d956 --- /dev/null +++ b/docs/source/topics/strategies.rst @@ -0,0 +1,4 @@ +======================================= +List of crawling strategies in Frontera +======================================= + diff --git a/docs/source/topics/tests.rst b/docs/source/topics/tests.rst index 92aa601cb..7e678a607 100644 --- a/docs/source/topics/tests.rst +++ b/docs/source/topics/tests.rst @@ -133,35 +133,6 @@ You can define the following test:: ... 
-Testing basic algorithms -======================== - -If your backend uses any of the :ref:`basic algorithms logics `, you can just -inherit the correponding test base class for each logic and sequences will be automatically tested for it:: - - from tests import backends - - - class TestMyBackendFIFO(backends.FIFOBackendTest): - backend_class = 'frontera.contrib.backends.abackend.MyBackendFIFO' - - - class TestMyBackendLIFO(backends.LIFOBackendTest): - backend_class = 'frontera.contrib.backends.abackend.MyBackendLIFO' - - - class TestMyBackendDFS(backends.DFSBackendTest): - backend_class = 'frontera.contrib.backends.abackend.MyBackendDFS' - - - class TestMyBackendBFS(backends.BFSBackendTest): - backend_class = 'frontera.contrib.backends.abackend.MyBackendBFS' - - - class TestMyBackendRANDOM(backends.RANDOMBackendTest): - backend_class = 'frontera.contrib.backends.abackend.MyBackendRANDOM' - - .. _pytest: http://pytest.org/latest/ From cb999cc5241b4d09a0fccc547fb473f1e4e6901e Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 23 Jul 2018 14:44:01 +0500 Subject: [PATCH 210/273] fixes of single process sqlalchemy tutorial --- docs/source/topics/quick-start-single.rst | 4 +++- frontera/contrib/backends/sqlalchemy/__init__.py | 12 +++++++----- frontera/settings/default_settings.py | 7 ++++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/docs/source/topics/quick-start-single.rst b/docs/source/topics/quick-start-single.rst index 6bbe8d18d..f72f3dd95 100644 --- a/docs/source/topics/quick-start-single.rst +++ b/docs/source/topics/quick-start-single.rst @@ -6,6 +6,8 @@ The idea is that you develop and debug crawling strategy in single process mode deploying crawling strategy for crawling in production at scale. Single process is also good as a first step to get something running quickly. + Note, that this tutorial doesn't work for :class:`frontera.contrib.backends.memory.MemoryDistributedBackend`. + 1. Create your Scrapy spider ============================ @@ -72,7 +74,7 @@ Configure frontier settings to use a built-in backend like:: This step is required only if your crawling strategy requires seeds injection from external source.:: - $ python -m frontera.worker.utils.add_seeds --config [your_frontera_config] --seeds-file [path to your seeds file] + $ python -m frontera.utils.add_seeds --config [your_frontera_config] --seeds-file [path to your seeds file] After script is finished succesfully your seeds should be stored in backend's queue and scheduled for crawling. 
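
The seeds file referenced by the command above is assumed to be a plain text file with one URL per line; the utility
then has to turn each line into a request and hand it to the crawling strategy. A simplified sketch, not the actual
implementation of ``frontera.utils.add_seeds`` (the helper name below is illustrative)::

    from frontera.core.models import Request

    def iter_seed_requests(path):
        # Yield a Request per non-empty line; blank lines are skipped.
        with open(path, "rb") as f:
            for line in f:
                url = line.strip().decode("utf-8")
                if url:
                    yield Request(url)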
diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index 1d4a62814..9b46d2efe 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -31,7 +31,8 @@ def check_and_create_tables(self, is_drop, is_clear, models): if is_drop: if model.__table__.name in inspector.get_table_names(): model.__table__.drop(bind=self.engine) - model.__table__.create(bind=self.engine) + if model.__table__.name not in inspector.get_table_names(): + model.__table__.create(bind=self.engine) if is_clear: session = self.session_cls() session.execute(model.__table__.delete()) @@ -41,9 +42,10 @@ def _init_strategy_worker(self, manager): settings = manager.settings drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') - model = self.models['StateModel'] - self.check_and_create_tables(drop_all_tables, clear_content, (model,)) - self._states = States(self.session_cls, model, + model_states = self.models['StateModel'] + model_dm = self.models['DomainMetadataModel'] + self.check_and_create_tables(drop_all_tables, clear_content, (model_states, model_dm)) + self._states = States(self.session_cls, model_states, settings.get('STATE_CACHE_SIZE_LIMIT')) self._domain_metadata = DomainMetadata(self.session_cls) @@ -53,7 +55,7 @@ def _init_db_worker(self, manager): clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') metadata_m = self.models['MetadataModel'] queue_m = self.models['QueueModel'] - self.check_and_create_tables(drop, clear_content, (metadata_m, queue_m,)) + self.check_and_create_tables(drop, clear_content, (metadata_m, queue_m)) self._metadata = Metadata(self.session_cls, metadata_m, settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) self._queue = Queue(self.session_cls, queue_m, settings.get('SPIDER_FEED_PARTITIONS')) diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index a9dd86a64..bc0a930b6 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -58,14 +58,15 @@ SPIDER_FEED_PARTITIONS = 1 SPIDER_PARTITION_ID = 0 SQLALCHEMYBACKEND_CACHE_SIZE = 10000 -SQLALCHEMYBACKEND_CLEAR_CONTENT = True -SQLALCHEMYBACKEND_DROP_ALL_TABLES = True +SQLALCHEMYBACKEND_CLEAR_CONTENT = False +SQLALCHEMYBACKEND_DROP_ALL_TABLES = False SQLALCHEMYBACKEND_ENGINE = 'sqlite:///:memory:' SQLALCHEMYBACKEND_ENGINE_ECHO = False SQLALCHEMYBACKEND_MODELS = { 'MetadataModel': 'frontera.contrib.backends.sqlalchemy.models.MetadataModel', 'StateModel': 'frontera.contrib.backends.sqlalchemy.models.StateModel', - 'QueueModel': 'frontera.contrib.backends.sqlalchemy.models.QueueModel' + 'QueueModel': 'frontera.contrib.backends.sqlalchemy.models.QueueModel', + 'DomainMetadataModel': 'frontera.contrib.backends.sqlalchemy.models.DomainMetadataModel' } SQLALCHEMYBACKEND_REVISIT_INTERVAL = timedelta(days=1) STATE_CACHE_SIZE = 1000000 From b8c1cac2f3c1fbf51579d4048c1912e456ae92b0 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 23 Jul 2018 16:42:10 +0500 Subject: [PATCH 211/273] Discovery crawling strategy --- docs/source/topics/frontera-settings.rst | 19 + docs/source/topics/installation.rst | 3 + .../contrib/backends/sqlalchemy/components.py | 2 + frontera/settings/default_settings.py | 2 + frontera/strategy/discovery/__init__.py | 526 ++++++++++++++++++ frontera/strategy/discovery/sitemap.py | 25 + setup.py | 4 + 7 files changed, 581 insertions(+) create mode 
From b8c1cac2f3c1fbf51579d4048c1912e456ae92b0 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Mon, 23 Jul 2018 16:42:10 +0500
Subject: [PATCH 211/273] Discovery crawling strategy

---
 docs/source/topics/frontera-settings.rst      |  19 +
 docs/source/topics/installation.rst           |   3 +
 .../contrib/backends/sqlalchemy/components.py |   2 +
 frontera/settings/default_settings.py         |   2 +
 frontera/strategy/discovery/__init__.py       | 526 ++++++++++++++++++
 frontera/strategy/discovery/sitemap.py        |  25 +
 setup.py                                      |   4 +
 7 files changed, 581 insertions(+)
 create mode 100644 frontera/strategy/discovery/__init__.py
 create mode 100644 frontera/strategy/discovery/sitemap.py

diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst
index ad1368e43..d9db1c7cc 100644
--- a/docs/source/topics/frontera-settings.rst
+++ b/docs/source/topics/frontera-settings.rst
@@ -141,6 +141,16 @@ without hitting backend on every request. Increase it if calls to your backend is
 if you need a fast spider bootstrap from seeds.
 
 
+.. setting:: DISCOVERY_MAX_PAGES
+
+DISCOVERY_MAX_PAGES
+-------------------
+
+Default: ``100``
+
+The maximum number of pages to schedule per domain by the Discovery crawling strategy.
+
+
 .. setting:: DOMAIN_STATS_LOG_INTERVAL
 
 DOMAIN_STATS_LOG_INTERVAL
@@ -439,6 +449,15 @@ Default: ``False``
 
 Whether to enable frontier test mode. See :ref:`Frontier test mode `
 
+.. setting:: USER_AGENT
+
+USER_AGENT
+----------
+
+Default: ``FronteraDiscoveryBot``
+
+The user agent string used by the Discovery crawling strategy.
+
 
diff --git a/docs/source/topics/installation.rst b/docs/source/topics/installation.rst
index 8f4ef86e9..399f6425b 100644
--- a/docs/source/topics/installation.rst
+++ b/docs/source/topics/installation.rst
@@ -27,6 +27,9 @@ Each option installs dependencies needed for particular functionality.
 * *zeromq* - ZeroMQ message bus,
 * *kafka* - Kafka message bus,
 * *distributed* - workers dependencies.
+* *s3* - dependencies required for adding seeds from an S3 share,
+* *redis* - RedisBackend dependencies,
+* *strategies* - built-in crawling strategy dependencies.
 
 .. _Python: http://www.python.org
 .. _pip: http://www.pip-installer.org/en/latest/installing.html
diff --git a/frontera/contrib/backends/sqlalchemy/components.py b/frontera/contrib/backends/sqlalchemy/components.py
index 68c0e4493..160cf7a2d 100644
--- a/frontera/contrib/backends/sqlalchemy/components.py
+++ b/frontera/contrib/backends/sqlalchemy/components.py
@@ -24,6 +24,8 @@ def func_wrapper(self, *args, **kwargs):
         while True:
             try:
                 return func(self, *args, **kwargs)
+            except KeyError as exc:
+                raise
             except Exception as exc:
                 self.logger.exception(exc)
                 self.session.rollback()
diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py
index bc0a930b6..fdf5d2aad 100644
--- a/frontera/settings/default_settings.py
+++ b/frontera/settings/default_settings.py
@@ -9,6 +9,7 @@
 BC_MAX_REQUESTS_PER_HOST = 128
 CANONICAL_SOLVER = 'frontera.contrib.canonicalsolvers.Basic'
 DELAY_ON_EMPTY = 5.0
+DISCOVERY_MAX_PAGES = 100
 DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1'
 DOMAIN_STATS_LOG_INTERVAL = 300
 
@@ -78,6 +79,7 @@
 TEST_MODE = False
 TLDEXTRACT_DOMAIN_INFO = False
 URL_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1'
+USER_AGENT = 'FronteraDiscoveryBot'
 ZMQ_ADDRESS = '127.0.0.1'
 ZMQ_BASE_PORT = 5550
 
diff --git a/frontera/strategy/discovery/__init__.py b/frontera/strategy/discovery/__init__.py
new file mode 100644
index 000000000..f9cc1feba
--- /dev/null
+++ b/frontera/strategy/discovery/__init__.py
@@ -0,0 +1,526 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function, absolute_import
+
+from math import floor
+from time import time
+from zlib import crc32
+
+import codecs
+import logging
+import random
+import six
+import six.moves.urllib.robotparser as robotparser
+from frontera.core.components import States, DomainMetadata
+from frontera.strategy import BaseCrawlingStrategy
+from frontera.strategy.discovery.sitemap import parse_sitemap
+from publicsuffix import PublicSuffixList
+from six.moves.urllib.parse import
urljoin, urlsplit
+from w3lib.util import to_bytes, to_native_str
+
+
+MAX_SITEMAPS = 100
+MAX_SUBDOMAINS = 10
+MAX_DOMAINS_REDIRECTS_STORE = 100
+SITEMAP_DOWNLOAD_MAXSIZE = 50 * 1024 * 1024  # 50MB
+DEFAULT_HOME_PATHS = [
+    '/', 'index.html', 'index.htm',
+    'default.htm', 'default.html',
+]
+DEFAULT_HEADERS = {b'Accept-Language': b'en-US,en'}
+
+
+def is_home_page_url(url):
+    parsed_url = urlsplit(url)
+    # XXX prevent exceeding hard limit with parametrized home links
+    return not parsed_url.query and (
+        not parsed_url.path or parsed_url.path in DEFAULT_HOME_PATHS)
+
+
+def is_accessible_domain(domain):
+    return 'fatal_error' not in domain
+
+
+def is_domain_to_ignore(domain, max_pages):
+    return (not is_accessible_domain(domain) or 'banned' in domain or
+            domain.setdefault('queued_pages', 0) >= max_pages)
+
+
+def justify_request_score_by_hostname(hostname, score):
+    hostname_crc = crc32(to_bytes(hostname, 'utf-8', 'ignore'))
+    perhost_score = abs(hostname_crc / 2147483647.0)
+    return floor(perhost_score * 10) / 10 + max(0.01, score - 0.01) / 10.0
+
+
+def update_domain_with_parser_data(domain, parser, url, body=None):
+    """Helper to update a domain metadata in the cache.
+    Body param is optional and can be used to drop the field.
+    """
+    domain['_rp'] = parser
+    domain['rp_timestamp'] = int(time())
+    domain['rp_url'] = url
+    domain['rp_body'] = body
+    if body is None:
+        del domain['rp_body']
+
+
+def consume_randomly(iterable):
+    """Helper to consume from iterable in random fashion.
+    Note that it converts an iterable to a list and keeps it in memory.
+    """
+    data = list(iterable)
+    size = len(data)
+    while size:
+        index = random.randrange(size)
+        yield data[index]
+        data[index] = data[size - 1]
+        size -= 1
+
+
+def is_valid_robotstxt(lines):
+    for raw_line in lines:
+        line = raw_line.strip(u'\ufeff').lower()  # '\xef\xbb\xbf' in case of bytes
+        if line and not line.startswith("#"):
+            if line.startswith("user-agent:") or line.startswith("sitemap:"):
+                return True
+            else:
+                return False
+    return False
+
+
+class DomainCacheProxyWeb(DomainMetadata):
+    def __init__(self, domain_metadata):
+        self._domain_metadata = domain_metadata
+        self._set_fields = {'subdomains', 'redirect_from', 'redirect_to'}
+
+    def __setitem__(self, key, value):
+        self._domain_metadata[key] = value
+
+    def __getitem__(self, key):
+        value = self._domain_metadata[key]
+        for k, v in six.iteritems(value):
+            if k in self._set_fields:
+                value[k] = set(value[k])
+        if 'rp_url' in value and 'rp_body' in value:
+            value['_rp'] = robotparser.RobotFileParser(value['rp_url'])
+            value['_rp'].parse(value['rp_body'].splitlines())
+        return value
+
+    def __contains__(self, key):
+        return key in self._domain_metadata
+
+    def __delitem__(self, key):
+        del self._domain_metadata[key]
+
+    def flush(self):
+        if hasattr(self._domain_metadata, "flush"):
+            self._domain_metadata.flush()
+
+    def setdefault(self, key, default=None):
+        if hasattr(self._domain_metadata, "setdefault"):
+            return self._domain_metadata.setdefault(key, default)
+        try:
+            value = self[key]
+        except KeyError:
+            value = default
+            self[key] = value
+        return value
+
+
+class Discovery(BaseCrawlingStrategy):
+
+    def __init__(self, manager, args, mb_stream, states_context):
+        self.logger = logging.getLogger("discovery")
+        backend = manager.backend
+        self.domain_cache = DomainCacheProxyWeb(backend.domain_metadata)
+
+        try:
+            psl_file = codecs.open("public_suffix_list.dat", encoding='utf8')
+        except FileNotFoundError as fne:
+            self.logger.exception("Please get the public suffix 
file from https://publicsuffix.org/") + raise + self._suffix_list = PublicSuffixList(psl_file) + self._states_ctx = states_context + self.states = backend.states + + self.user_agent = to_native_str(manager.settings.get('USER_AGENT')) + self.max_pages = int(manager.settings.get('DISCOVERY_MAX_PAGES')) + super(Discovery, self).__init__(manager, args, mb_stream, states_context) + + @classmethod + def from_worker(cls, manager, args, mb_scheduler, states_context): + return cls(manager, args, mb_scheduler, states_context) + + def close(self): + self.domain_cache.flush() + super(Discovery, self).close() + + # Handling seeds logic + + def read_seeds(self, stream): + processed, scheduled = 0, 0 + requests = [] + for line in stream: + url = to_native_str(line.strip()) + if url.startswith("#"): + continue + if not url.startswith("http"): + url = "http://" + url + "/" + try: + request = self.create_request(url, meta={b'home': True}, headers=DEFAULT_HEADERS) + requests.append(request) + if len(requests) % 40000 == 0: + scheduled += self._schedule_batch(requests) + processed += len(requests) + self.logger.info("Processed %d, scheduled %d urls.", processed, scheduled) + requests = [] + except: + self.logger.exception("Error during seeds addition") + if requests: + try: + scheduled += self._schedule_batch(requests) + except: + self.logger.exception("Error during seeds addition") + processed += len(requests) + self.logger.info("Processed %d, and scheduled %d urls overall.", processed, scheduled) + + def _schedule_batch(self, requests): + self.refresh_states(requests) + scheduled = self.process_seeds(requests) + self._states_ctx.release() + return scheduled + + def process_seeds(self, seeds): + """Handle and schedule a batch with seeds urls. + + We call seeds only those URLs which were injected during the crawling + bootstrapping process. So seeds cannot be found during the crawling. + """ + robots_requests = set() + scheduled = 0 + for seed in seeds: + parsed_url = urlsplit(seed.url) + robots_url = "{url.scheme}://{url.netloc}/robots.txt".format(url=parsed_url) + meta = {b'netloc': parsed_url.netloc, + b'seed': seed.url, + b'robots': True} + request = self.create_request(robots_url, meta=meta, headers=DEFAULT_HEADERS) + robots_requests.add(request) + self.refresh_states(robots_requests) + for request in robots_requests: + if self._schedule_once(request, None, score=0.9): + scheduled += 1 + else: + self.logger.warning("The seed %s was already scheduled", request.url) + return scheduled + + # Strategy main handlers section. 
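(For illustration only, not from the patch: the seed bootstrap above reduces every seeds-file line to a robots.txt probe. A minimal standalone sketch of that normalization, using the same six helper the module imports::

    from six.moves.urllib.parse import urlsplit

    def robots_url_for(line):
        url = line.strip()
        if not url.startswith("http"):      # as in read_seeds()
            url = "http://" + url + "/"
        parsed_url = urlsplit(url)          # as in process_seeds()
        return "{url.scheme}://{url.netloc}/robots.txt".format(url=parsed_url)

    assert robots_url_for("example.com") == "http://example.com/robots.txt"
    assert robots_url_for("https://example.com/a?q=1") == "https://example.com/robots.txt"

Each probe is scheduled once with score 0.9; the handlers below then route its response through the robots.txt and sitemap logic.)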
+ + def page_crawled(self, response): + response.meta[b'state'] = States.CRAWLED + # if redirects, response.url always contains initial url + self.logger.debug("PC %s [%d] (seed: %s)", response.url, + response.status_code, response.meta.get(b'seed')) + self._log_redirects_if_defined(response.request) + is_succeeded = response.status_code in [200, 203, 206] + netloc, _, domain = self._get_domain_after_redirects(response.request) + if b'robots' in response.meta: + if is_succeeded: + self._process_robots_txt(response, domain) + else: + self._process_robots_txt_error(netloc, response.url, domain) + elif b'sitemap' in response.meta: + if is_succeeded: + self._process_sitemap(netloc, response.body, domain) + if is_accessible_domain(domain): + self._schedule_home_page(netloc, domain) + + def filter_extracted_links(self, request, links): + netloc, level_2nd_name, domain = self._get_domain_after_redirects(request) + if is_domain_to_ignore(domain, max_pages=self.max_pages): + return [] + robotparser = domain.get('_rp') + chosen_links = [] + for link in links: + if not self._is_from_same_domain(level_2nd_name, link): + continue + # validate that robots.txt allows to parse it (if defined) + if robotparser and not robotparser.can_fetch(self.user_agent, link.url): + continue + chosen_links.append(link) + # maybe ban the domain if it's eligible for ban + link_netloc = urlsplit(link.url).netloc + link_hostname, _, _ = link_netloc.partition(':') + link_2nd_level, link_domain = self._get_domain(link_netloc) + subdomains = link_domain.setdefault('subdomains', set()) + subdomains.add(link_hostname) + return chosen_links + + def links_extracted(self, request, links): + # if redirects, request.url contains final url + self.logger.debug('LE %s (seed %s) %d extracted links', + request.url, request.meta.get(b'seed'), len(links)) + self._log_redirects_if_defined(request) + _, level_2nd_name, domain = self._get_domain_after_redirects(request) + for link in links: + link.headers.update(DEFAULT_HEADERS) + self._process_links(links, domain) + + def page_error(self, request, error): + request.meta[b'state'] = States.ERROR + # if redirects, request.url always contains initial url + self.logger.debug("PE %s error: %s (seed: %s)", + request.url, error, request.meta.get(b'seed')) + self._log_redirects_if_defined(request) + netloc, _, domain = self._get_domain_after_redirects(request) + if error == 'DNSLookupError': + # marking DNS lookup error as fatal, to continue without discovery + domain['fatal_error'] = error + if b'robots' in request.meta: + self._process_robots_txt_error(netloc, request.url, domain) + elif b'sitemap' in request.meta and is_accessible_domain(domain): + self._schedule_home_page(netloc, domain) + + # Additional helper handlers for robots.txt and sitemap logic. + + def _process_robots_txt(self, response, domain): + """Handle robots.txt successful response. + + The main logic behind the method is to create a RobotFileParser instance + if it's possible to decode and read robots.txt content, and save it as a + property of domain to reuse it later when deciding about need to schedule + a domain page or not. 
+ """ + netloc = response.meta[b'netloc'] + domain.setdefault('queued_pages', 0) + try: + body = response.body.decode('utf-8') # response.meta.get(b'encoding', 'utf-8') + except UnicodeDecodeError: + self.logger.warning("Error during robots.txt decoding at %s", response.url) + update_domain_with_parser_data(domain, parser=None, url=response.url) + self._schedule_home_page(netloc, domain) + return + robots_lines = body.splitlines() + parser = robotparser.RobotFileParser(response.url) + try: + if not is_valid_robotstxt(robots_lines): + raise SyntaxError("Robots.txt isn't valid") + parser.parse(robots_lines) + except: + self.logger.exception("Error during robots.txt parsing at %s", response.url) + update_domain_with_parser_data(domain, parser=None, url=response.url) + self._schedule_home_page(netloc, domain) + return + requests = set() + for line in robots_lines: + if line.startswith("Sitemap:"): + _, _, url = line.partition(':') + sitemap_url = urljoin(response.url, url.strip()) + meta = {b'seed': domain.get('seed'), b'sitemap': True, + b'scrapy_meta': {b'download_maxsize': SITEMAP_DOWNLOAD_MAXSIZE}} + requests.add(self.create_request(sitemap_url, meta=meta, headers=DEFAULT_HEADERS)) + self.refresh_states(requests) + # schedule sitemap requests + self._schedule_requests(requests, domain, score=0.9) + if not requests: + self.logger.debug("Sitemap in robots.txt wasn't found for url %s", response.url) + update_domain_with_parser_data(domain, parser=parser, url=response.url, body=body) + # also always schedule home page regardless of scheduled sitemaps + self._schedule_home_page(netloc, domain) + + def _process_robots_txt_error(self, netloc, url, domain): + """Handle robots.txt failure response.""" + update_domain_with_parser_data(domain, parser=None, url=url) + if is_accessible_domain(domain): + self._schedule_home_page(netloc, domain) + + def _process_sitemap(self, netloc, body, domain): + """Helper to process a sitemap request's response. + + Current logic is to split sitemap body content into sub-sitemaps and other + entries, and schedule it (sub-sitemaps could be scheduled as-is with higher score, + but other links should be processed differently exactly as links extracted from + crawled page - sub-domains homepages have more priority over others requests). 
+        """
+        if is_domain_to_ignore(domain, self.max_pages):
+            return
+
+        requests, sitemaps = set(), set()
+        sitemap_scrapy_meta = {b'download_maxsize': SITEMAP_DOWNLOAD_MAXSIZE}
+        for url, sub_sitemap in parse_sitemap(body):
+            try:
+                meta = {b'seed': domain.get('seed'), b'sitemap': True,
+                        b'scrapy_meta': sitemap_scrapy_meta} if sub_sitemap else (
+                    {b'home': True} if is_home_page_url(url) else {})
+                request = self.create_request(url, meta=meta, headers=DEFAULT_HEADERS)
+            except:
+                self.logger.exception("Error on url %s", url)
+                continue
+            sitemaps.add(request) if sub_sitemap else requests.add(request)
+        # 1) handle sub-sitemaps
+        if len(sitemaps) > MAX_SITEMAPS:
+            # TODO global per-host counter of sitemaps scheduled
+            self.logger.warning('Amount of sub-sitemaps > %d for url %s', MAX_SITEMAPS, netloc)
+            sitemaps = set(random.sample(sitemaps, MAX_SITEMAPS))
+        self.refresh_states(sitemaps)
+        self._schedule_requests(sitemaps, domain, score=0.9, count=False)
+
+        # 2) handle separate entries
+        # current policy is to trust sitemap data, and don't verify hostname for links
+        to_sample = self.max_pages - domain.get('queued_pages', 0)
+        if to_sample > 0 and len(requests) > to_sample:
+            requests = random.sample(requests, to_sample)
+        self.refresh_states(requests)
+        self._process_links(requests, domain)
+
+    def _process_links(self, links, domain):
+        """Helper to process and schedule extracted links.
+
+        The method splits a given links set into 3 parts:
+        - home pages for domain/sub-domain to schedule with higher score
+        - links of interest
+        - other pages
+        After splitting, the method schedules the requests.
+        """
+        if is_domain_to_ignore(domain, self.max_pages):
+            return
+        # at first schedule home pages with higher priority, and add others to a set
+        home_links, interest_links, other_links = set(), set(), set()
+        for link in links:
+            link.meta[b'seed'] = domain.get('seed')
+            if is_home_page_url(link.url):
+                # XXX it may look proper to tag such links with meta[b'home'] = True,
+                # but it would mean trusting to any home link found among extracted,
+                # and lead to infinite amount of domains to crawl and infinite crawl
+                home_links.add(link)
+            elif self.is_link_of_interest(link):
+                interest_links.add(link)
+            else:
+                other_links.add(link)
+        self._schedule_requests(home_links, domain, score=0.8)
+        self._schedule_requests(interest_links, domain, score=0.7)
+        self._schedule_requests(other_links, domain, score=0.5)
+
+    def is_link_of_interest(self, link):
+        """Predicate helper to match important links.
+        To be implemented in a subclass."""
+
+    # Helpers to schedule different types of requests
+
+    # The following 2 methods accept a dict with domain metadata and control amount
+    # of queued pages already scheduled for the domain, please schedule all needed
+    # requests only via these methods. Domain metadata also must contain seed field
+    # to track it when validating results.
+
+    def _schedule_home_page(self, netloc, domain):
+        """Schedule a domain home page.
+
+        The method enforces setting 'seed' meta field for the request.
+ """ + if domain.setdefault('queued_pages', 0) >= self.max_pages: + return + home_page = "http://%s/" % netloc + meta = {b'seed': domain.get('seed'), b'home': True} + request = self.create_request(home_page, meta=meta, headers=DEFAULT_HEADERS) + self.refresh_states([request]) + if self._schedule_once(request, domain, score=0.8): + domain['queued_pages'] += 1 + self.logger.debug("Scheduled home page %s", request.url) + return True + return False + + def _schedule_requests(self, requests, domain, score, count=True): + """Helper to schedule a bunch of requests in random order. + + The method schedules requests as-is w/o any modifications (except for score), + make sure you have set all needed headers/metadata/etc before calling it. + """ + scheduled = 0 + if not is_accessible_domain(domain): + return scheduled + already_queued_pages = domain.setdefault('queued_pages', 0) + # XXX to avoid converting links set to a list if enough pages + if count and already_queued_pages >= self.max_pages: + return scheduled + for request in consume_randomly(requests): + # scheduling pages randomly if they fit within limits + if count and domain['queued_pages'] >= self.max_pages: + self.logger.debug('LIMIT REACHED pages (%d) for seed %s', + domain['queued_pages'], domain['seed']) + break + if self._schedule_once(request, domain, score=score): + self.logger.debug('IL Scheduled %s', request.url) + domain['queued_pages'] += 1 + scheduled += 1 + return scheduled + + def _schedule_once(self, request, domain, score=0.1): + """Accept a request object, justify its score and schedule it. + + The method schedules a request as-is w/o any modifications (except for score), + make sure you have set all needed headers/metadata/etc before calling it. + """ + robotparser = domain.get('_rp') if domain is not None else None + if robotparser and not robotparser.can_fetch(self.user_agent, request.url): + return False + if request.meta[b'state'] != States.NOT_CRAWLED: + return False + hostname = urlsplit(request.url).hostname # hostname is already lower-cased + if not hostname: + self.logger.warning("Can't parse hostname for '%s'", repr(request.url)) + return False + final_score = justify_request_score_by_hostname(hostname, score) + self.schedule(request, final_score) + request.meta[b'state'] = States.QUEUED + return True + + # Auxiliary helpers section + + def _get_domain_after_redirects(self, request): + seed = request.meta.get(b'seed') + redirect_urls = request.meta.get(b'redirect_urls', []) + origin_url = redirect_urls[0] if redirect_urls else request.url + origin_netloc = urlsplit(origin_url).netloc + origin_2nd_name, origin_domain = self._get_domain(origin_netloc) + + if redirect_urls and (b'robots' in request.meta or + b'sitemap' in request.meta or + b'home' in request.meta): + final_netloc = urlsplit(redirect_urls[-1]).netloc + if final_netloc != origin_netloc: + origin_redirects = origin_domain.setdefault('redirect_to', set()) + self._extend_redirects_list(origin_redirects, final_netloc) + final_2nd_name, final_domain = self._get_domain(final_netloc) + final_redirects = final_domain.setdefault('redirect_from', set()) + self._extend_redirects_list(final_redirects, origin_netloc) + final_domain['seed'] = seed + return final_netloc, final_2nd_name, final_domain + + origin_domain['seed'] = seed + return origin_netloc, origin_2nd_name, origin_domain + + def _log_redirects_if_defined(self, request): + redirect_urls = request.meta.get(b'redirect_urls', []) + for redirect_url in redirect_urls: + self.logger.debug("REDIR %s", 
redirect_url)
+
+    def _extend_redirects_list(self, redirects, netloc):
+        """Helper to add a netloc to redirects list within limit."""
+        if netloc not in redirects and len(redirects) < MAX_DOMAINS_REDIRECTS_STORE:
+            redirects.add(netloc)
+
+    def _get_domain(self, netloc):
+        """Helper to get a 2nd level domain and corresponding meta for a given netloc.
+        Returns a tuple with a domain name and a metadata dict from domain cache.
+        """
+        domain = self._get_2ndlevel_name(netloc)
+        return domain, self.domain_cache.setdefault(domain, {})
+
+    def _is_from_same_domain(self, domain_name, request):
+        """Helper to check if a request url points to the same domain."""
+        return self._get_2ndlevel_name(urlsplit(request.url).netloc) == domain_name
+
+    def _get_2ndlevel_name(self, netloc):
+        """Helper to extract a host from netloc and get its public suffix."""
+        hostname, _, _ = netloc.partition(':')
+        return self._suffix_list.get_public_suffix(hostname)
diff --git a/frontera/strategy/discovery/sitemap.py b/frontera/strategy/discovery/sitemap.py
new file mode 100644
index 000000000..3d3c1859a
--- /dev/null
+++ b/frontera/strategy/discovery/sitemap.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+from bs4 import BeautifulSoup
+
+
+def _process_sitemap(s):
+    soup = BeautifulSoup(s, "lxml")
+    result = []
+    sub_sitemaps = []
+
+    for loc in soup.findAll('loc'):
+        if loc.parent.name == 'url':
+            result.append(loc.text.strip())
+            continue
+        if loc.parent.name == 'sitemap':
+            sub_sitemaps.append(loc.text.strip())
+            continue
+    return result, sub_sitemaps
+
+
+def parse_sitemap(content):
+    sitemap, sub_sitemaps = _process_sitemap(content)
+    while sitemap:
+        yield (sitemap.pop(), False)
+    while sub_sitemaps:
+        yield (sub_sitemaps.pop(), True)
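(For illustration only, not from the patch: ``parse_sitemap`` yields ``(url, is_sub_sitemap)`` tuples, page entries with ``False`` first and nested sitemaps with ``True`` second. A minimal usage sketch, assuming beautifulsoup4 and lxml are installed::

    from frontera.strategy.discovery.sitemap import parse_sitemap

    body = b'<?xml version="1.0" encoding="UTF-8"?>' \
           b'<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' \
           b'<sitemap><loc>http://example.com/news.xml</loc></sitemap>' \
           b'</sitemapindex>'

    for url, is_sub_sitemap in parse_sitemap(body):
        print(url, is_sub_sitemap)  # -> http://example.com/news.xml True

This is the contract the strategy's ``_process_sitemap()`` handler relies on when it splits entries into page requests and further sitemap downloads.)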
diff --git a/setup.py b/setup.py
index e7e997283..df0e30a3c 100644
--- a/setup.py
+++ b/setup.py
@@ -79,6 +79,10 @@
         'redis': [
             'redis>=2.10.5',
             'hiredis>=0.2'
+        ],
+        'strategies': [
+            'beautifulsoup4',
+            'publicsuffix'
         ]
     },
     tests_require=[

From e339e490fec5d65cc581f8df83a4592a8d7d9fa1 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Mon, 23 Jul 2018 16:55:46 +0500
Subject: [PATCH 212/273] removed distributed spiders run mode

---
 docs/source/topics/_images/high-level-arc.png | Bin 33927 -> 29166 bytes
 docs/source/topics/run-modes.rst              | 44 ++++++------------
 2 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/docs/source/topics/_images/high-level-arc.png b/docs/source/topics/_images/high-level-arc.png
index 542e3e145fdd9b63986b203613eb2cb5fc773266..cedb7eff859bbfb92cc28b2cfb6f84fe2f64b7a2 100644
GIT binary patch
[base85-encoded binary data for high-level-arc.png (literal 29166 / literal 33927) omitted]
zLC+gv7))W)yV;};yI!Lg^|hD~JTpEL9}-bKpGW`Rr}ztZSCE1nhYHyrw|^i5#CF;| zNV>Tre{xv0KWHSW`02Lcoj$B*f-xYL@n;F0FJ>LMyA{%lq;HJ-2R;Rnf6$h=Di6s0 zupu@pMSIQ=KRwAIoH5_Y5efow&a(K_y zrfm&!5f%N@?nkVz%4^MI4&!K(YoQAvB(@*kO3z~Wy+u8~hf%@Ey5R}>19zBzu^2gs zK#*?fntSqTcwP9B%G4@tyYU}%MCEgw;R!~|9!8C}u*#eS{O=eSzRFB3A1$q-;X{@t zB>mS3=0@0yC^ZlMQ2+ahOF%kIg+y)RKaV17*}%p(XGLy7Pv|8>XE5;cM~y@s)T^is z=?X3fQ^j7+KG{G-*Yxj1g9A-cEHzHsA{G_r3?~;q{Pg5hwj=NN6g*Ihs{Nu`(f&fp z7-q5CGxc~L)eiD8x&}^}y#zqI{1oSp=;!cUxxVRJvVf69R|ulPFw3jeTMat7^G81A zd8YFXH5cuKb*0>j>y2NCr8(POu<*JooAcU)=&P2|NGrL**CWYC@Q9H#9Mh5D*=8fe z^<*F&@RXvr7e(`< zFS0;M)@s3F`?_=in5>5a?@nkCWguN+GK}rab3T zgMAz_zV3*#{l$en3f0cvrcp!6B7xH+kGw;`=a_j0A>jV z2p@GdH1PTNhEn+vfu=9wk^#bWZG5FShy%LN;LwKPIdHJCxm*waePPg{p*U$mOB9?$ z)Te-P_wW|k%vu7G8rnl{U2LejSCV$0A1g-1mvc&OW;o{ODKPFnb(o(lpyD+P1C&d? zsxMz=L9mnF%j_U}z)|_-8`Wrk{}EvQtWh3;45%#_1+DoOK&4NZ)ibu%H#VB<-8M(g z%nAxVSbw+%WBUui0?L%~Iy5-|VcAK8{oL=f-MNW$Kt5YAo~D^Ds}HdMjaOi)rC6-7 z)QMek$~+l5?8!xkyMnnV#kj*w6l?>zV;JCY{w20GI@YfNwUO#JTK?Do?wB;a%NAi~ z2YCiXuJtRBqkVOueve0QlrCWV9$^(mG(FOV>Rw^^j89iOI3Fi}{AdY{z~&XUi}~09 zNWk1&Ssxb=S5L!9f%v=0*RaWTJyYcQV``CT)h6HD1;Jk9?t-UE5ZPZ za44a4k&(%`*)a5~PRLv*hkpEcr(l^J^O&arB$8k9Q66Gf>l*t{)z;Nj0Nih6W;+6> z2O0qm%k7JR594;|1@fs1%{&x02|$QVfVX~E7Bla`{_`U*5^x;cBDUs0HfjVYdX$YtIb_#~GBo-9nDQyAbP9pzA&&VhX{`O~q z=}T2>Bc1u18^0F7DzOKN%)T&#hlgFBn*x7x9JR!}JU}cAc_&-cpl^AbaPrM(D?|KJ z(HPLxr9>fM*(M6ECj7-Hx0usNr%t{m6z)cirZ7jM-rT)mNuH&)P;J^3n3v($QJYI`)aAYum7bL%D31PqX|F_&TaE&C8_n1nU%{gfpl8+`VQM#Dz3V0_~#5{JLRfP9LC7R|4a1y^P zB%seh8TY5JtU)~hZKjEkgG($^Gcm$?atZLw7(72V#W#ny^OHH%s-lLd22l;QhXtVV zhP(qC8u}zO{ILuXklE3|{m1n2D-7bBRS!sWGK-D3s6JOANQ#sAb``r% z6_p4GTHj=Yq|}RK0wZT@yA@?KhMfdXpE6=0gHjXz&o%5~da+^WM|B<0U4kV{P{Jcc zZw*tf<@phZ&!94ohcEVeKo%isU=$KXDeb3T$;(NB7icJg%`8>*mryZfwuCXtQEELW zg5D4HS)KSYKbpJSD7(vC`ZdMYk3sv?731gIJYAU-=&m#~7B8}33Fu2LqPM<ryw!>(WatXCIOHNet&MwT#4#T+^v!hQzdK9`tfQw#R&(ewR z134YV4~!j<>s%{7qkCozk;*xQO=svN{%~KLjGhJ4)qvm&Qi1=2WD@4~(QhU(T36!)qtbn>z28r#5jGeb zc8ARsES#6BiaH*S6iG1>J{!Dy3uCJvNEHV`M8A0j2UAw3lgnE%+JNmdrJHrw1A=0~ zV5>e;>h96~NxRKLY`*uP7^$!{t)XkjY&TlyJQ6xImk!z%FCl(kdwlVw>h1|9WlLX| zAj)okeRBxAvpetURXq489voq5WV=bfdk~GZ>?M!Z25IKeV0vK+QIs||32tFx*T_Kk zOYobGIaiHZtt4v$gNlxNnKqzHt8VeFon-3ih7f91?N5!x*bt99u!x81L5#)~q| zy8mD#=<9O#k$DA|*T4eLv4pu(CyNOFhfY226;(W{6iCcL4*jSsTqcfw$ z@5*&>V4xPXM{3vMr~|Rl40WO>tg>O(j6$zYy;dn!%Gen)DPw~P z!cG7Z<8viwnq8Uq#Lm?Lx-C*$d9ADbT(f)GKeR{@K=rhPPUdEwdSD25qWVvLave7*Hx%gn}*l4s!H zP*lUb*t~HeUV^3Ymt@0G&j+K>_1vl5%E51!l z%fbe(X%!~K;MugIf3o3i8tybCv@xgrB0Hxo*6c247#U%l!kioaTgG&f_u$u?iw=|N z;iny0zLke?e=ANKNrUrIdFX7vA4wbjewNdWKT+LyRWtThgI}&njlh&9hWVuY_wU~k z;o;%#SeTf0<_q|7kMqddzJOdR);04BZsY-7#oltqJ!@ZdbTlp|ZsT;8gs2`*pe!R@`pU!bqC!7 znIL1Uo8z^|-#{Zjn1d+Ti%9q}SpX7F*c~R*z@?!PQ7*8o6^2^L4Hxk2Uu3#-l`F>J zW}6wow#2hHpHQ8ge-{u?Xl&n-Yf@MIqu9QC^PT1^al&=3xs?Goo2cpCz2@E3ovYM2 zuUc`S>Y(N-5iJ%MasZiH#HHmt#XLm2_l8F$plK_xOq3Dt^swN?%447A^K*xo+CWHL z#AV(SvW>up{w%5yFD>MTN837`^A6U2!5VPGNf>bBs;y4{&Iom1-bGOcHqBA02@4S6!|WeSqT`z~(X4ne zcZzAw&&|LQ;8K)Ml>p(1$O>UDDn1N*c7-eKx-I4!*l!*nwb0wwR=fG;RV_NCRSE1qV!_KuouXR|Hs}IqrO~|Xwpt&F+|^T2`^N9h zc=%lzgu0+r!fy`L7Atxvn zycc5NysWSDvh`W-Q2m|9nsiEz<^+ zXit-Vd{UbHJrYU(y=^n;;9se+M_chQHkiozDOF_$Crb$~(|c*rTO!8yp~sJqv!!u$ z0x;Y|3Zc>OkY#k(Ac4~C=hf7h{Y~1Ksf5~y_MIgwo7yb;u5>t0xbQY4{mJtn3wjM| zWI;F2pvs6@H+^P^3y=a)2_+4%cA;0F;4tKMNUOm4=~B^?KGSyU&pFm; zlFO+@6{TN6-_phRbvj8s^8npH5)yY}bE)F!aatOE5mek$CO3&nA$oly5v4NUs05Lv zklC*5hv2y=G37*wmV9xq!KObH+2O+!eVAJp&p(LGO2YUHSjNq_%MBwfzx2|#-cr7YtD=?|hEjNxJU5|y1fM@f_2>ZNA1 zoociz%5gi%Z$=?8&@RhIG*<`4{(J)$phVxPq&aJEI9K?m{n_;aFezAp8(~8{A+4L{ 
zFCc#+M5pY}XMGG8=Ygg`-@MllG?YbUXUs8LCZX4YfFwbHMPp7$b3{SHTE>rI+S=C0 znnWrT{Mf9=hF@xfY`Ro62WKP$z?cU( zBSeSYzm`aT7;U8AMn@*&X654BNkRs9ou@U2h?< z&49Nx918Woh>nSALmcP>l7!G1uo+j$$2?jBNYpmO!7;vfSDS615YPexO0aCt9x`h$ z!)Dk~GLP{-*pvs~fb&WQu{4{6JojH8l-G^8+8PDkx_gc^I4rbE`v3iMi^%RTw(=5u zJ}|2t8~JQAdA1f8_qD9OS7`;ARnwl=J>BNG?>FBQ#`-E4jWzi-2l4HI)-IxU2C79C z?7{WUlduRBG~82l;T2S;wZ6n9kdv&^4R}btS3d=Kb&o#?7zyIU25%es3s+ayR+-R; z-VYu;SOVGc;+3j^Nhi|5#B(Y4Z7`Ph2Y}vl28b$d2jE#D4GicAEAXNZJzzs^h}kKOHWe)VR9uHa3Ih*V2`&jpMl6{9^`C4MS$k1y~4O-36YV4-by6`MjpBLF(=Nb zPw4M;CGxp@kOJdY+JFw~Z&}*Ep%f{qAK!u>hcgDJQR^%|$t$2q!x{c@jyR76fxrUT z^9j>%$qmx>?-SUX)$RvNi}+*|X|;GNkydVHKtC)tA`n%NGrZOS)JdD2-m*aU^wZnV zF_v|HV>sQ~ZDWX6poS*d7~&DDkY#^#Sb@Z0f0u=QL{b3uT10a2-%vnvKrnd@#uy}@ zEm*tD9m}&{%XKPH_%Yoi=B-Av9Bh~yYs`D+cc#klM21d5rcq-1$tdVuqY7O;3)O=9c8yw?l#%uQyYUp-Og2dAaF?c9f`hxAB`fJ6zR zBJJjnc0OHfLjR5w0?R@$qTDKB?_(o+W64%CtRiu#b|WYpoqOr4mH>nh(M51;j*V?} zT(kpSPJTi2WxyPa*qr2!Za@5ng+tY~3tAvgj5brPvuaxQn+K@_Z(PcZJK8O+ty}DW z1P`_U-Qj4IIu^w+=x7%(7Tvm!pblwB+Mkg9Cvm2gLme;VClhn5nGy00Ql#*8u4D14 z-&<^%G3RkK2ncA8eD<>fOs)m4D;6G}QFqA`&m5&i5PhG`w)h^e^`nWE)|`+B5PNy4 z@3E-$5oSwxkiSB9(fXhRZgo5!Y>@RWmh9E&-H}M?%|FDhtA|Lhk}ZL1lRtVo2l?8R zsyZ)0a^CnefY^&%iCwm)jpp(04!R4h)*T^^By$7=LU2=m%1xo_*vZbT!rO5gM&GLP zv7)3Q=r-Rzph{~e!Nm~{EL>}1u)?A(oG7$r|;_v z(mS1ql`{Q$k5(6(b|@GBmzJy|pC}l!Lv_4_kG%{%{}WOd)Z_JU0mIjVA-QpMwnSU| z)xtYANc}&P^VFFg3^_DWCr>k3YX^j40m(8A8W6x;af8d57l>VFn3|cUWrYCx^VV8| zW2l&!T^7JxZe6fa_RVOi9fwBpC!*ySIDu31`*I^XRy_^cfaqqwMeH(q?E@08 zQRx}%aPQ$=@zcz^V)(U%Q81zQam1MjZT1)M5dbHog;Vr=hrb3u zYgq4->_!P7sqI}W@v40yoP-{{x!{XOqxR}-0fd`Hm8VZq-$n|VHre~0?X1VU z6+c=kTsP|G1$|gNwGC=!o}=k<{Z&JN%%k>k^#w;hxpq~1nd!R)C}uSRbU$8QM$?;C zaGb!><&sUOUNN57v**=+2@V3L_cn#5`M1M$~&Y99wS( z<0#rtuy;j3ZuJMW0CDg{kVAeid3R>sfgjQ=Rim}YL7e7-+B<2zPNFcGuSQy^? zQL}LqFP@-z18TZ^h@%q_4PyD@#NN+<9_h>q)-|ZNjSA{}`Oa0+<>REj5a%rtl5M}H zC7=WdP2}0!Dx9$D9%kN;;cjNNN}Qd;9&K8D!qYAi;}+ny55LD$TD&F_5`glX+AN>mLTRnoEbzq zI}23Df&Ei@nVN)q z4ON&;`E;z}9fQd4oi~091Pl`xGKhqvUM%-GDo7s#SOy+1>$uS88~qB>c(4%ecG;|1 z%lHDsjZBDewAFzf7D03v>Fkp6)B!EOZysKJN~19DOUJi{Fz38W24UL7aYM-%nR*W2 zhg=KckS{M2fKJR)_BA-2(|~ZS>h!` single process running the crawler Memory, SQLAlchemy -Distributed spiders :class:`Backend ` spiders and single :term:`db worker` Memory, SQLAlchemy -Distributed backends :class:`DistributedBackend ` spiders, :term:`strategy worker` (s) and db worker(s). SQLAlchemy, HBase -==================== ========================================================================= ====================================================== ===================== +==================== ====================================================== +Mode Components needed +==================== ====================================================== +Single process single process running the crawler +Distributed spiders, :term:`strategy worker` (s) and db worker(s). +==================== ====================================================== Single process ============== -Frontera is instantiated in the same process as fetcher (for example in Scrapy). To achieve that use :setting:`BACKEND` -setting set to storage backend subclass of :class:`Backend `. This run mode is -suitable for small number of documents and time non-critical applications. +Frontera is instantiated in the same process as fetcher (for example in Scrapy). Read more on how to use that mode +:doc:`here `. -Distributed spiders -=================== +This mode is suitable for developing the crawling strategy locally and applications where its critical to fetch +small number of documents fast. 
-Spiders are distributed and backend isn't. Backend is running in :term:`db worker` and it's communicating with
-spiders using :term:`message bus`.
-1. Use :setting:`BACKEND` in spider processes set to
-   :class:`MessageBusBackend `
-2. In DB worker :setting:`BACKEND` should point to :class:`Backend ` subclass.
-3. Every spider process should have it's own :setting:`SPIDER_PARTITION_ID`, starting from 0 to
-   :setting:`SPIDER_FEED_PARTITIONS`.
-4. Both spiders and workers should have it's :setting:`MESSAGE_BUS` setting set to the message bus class of your choice,
-   and other implementation depending settings.
-
-This mode is suitable for applications where it's critical to fetch documents fast, at the same time amount of them
-is relatively small.
-
-
-Distributed spiders and backend
-===============================
+Distributed
+===========
 
 Spiders and backend are distributed. Backend is divided on two parts: :term:`strategy worker` and :term:`db worker`.
 Strategy worker instances are assigned to their own part of :term:`spider log`.
@@ -58,6 +42,6 @@ Strategy worker instances are assigned to their own part of :term:`spider log`.
 5. Both spiders and workers should have it's :setting:`MESSAGE_BUS` setting set to the message bus class of your choice
    and selected message bus have to be configured.
 
-Only Kafka message bus can be used in this mode out of the box and SQLAlchemy and HBase distributed backends.
+Only Kafka message bus can be used in this mode out of the box.
 
-This mode is suitable for broad crawling and large amount of pages.
+This mode is designed for web-scale crawling of a large number of domains and pages.
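+
+A settings sketch for this mode (the message bus class path and the partition
+counts below are illustrative assumptions, not the only valid values)::
+
+    MESSAGE_BUS = 'frontera.contrib.messagebus.kafkabus.MessageBus'
+    SPIDER_FEED_PARTITIONS = 2   # one partition per spider process
+    SPIDER_PARTITION_ID = 0      # unique per spider, 0 .. SPIDER_FEED_PARTITIONS-1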

From b3247fcced05857b115990e79efd0c6d687d9415 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Mon, 23 Jul 2018 17:32:16 +0500
Subject: [PATCH 213/273] crawling strategy docs

---
 docs/source/index.rst                           | 18 ++++----
 .../topics/custom_crawling_strategy.rst         |  6 +--
 docs/source/topics/strategies.rst               | 46 +++++++++++++++++--
 3 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index a99d74b68..4c9f71c01 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -5,8 +5,9 @@ Frontera |version| documentation
 ================================
 
 `Frontera`_ is a web crawling tool box, allowing to build crawlers of any scale and purpose. It includes:
-* :ref:`crawl frontier ` framework managing *when* and *what* to crawl and checking for
-crawling goal* accomplishment,
+
+* :ref:`crawl frontier ` framework managing *when* and *what* to crawl and checking for *crawling goal* accomplishment,
+
 * workers, Scrapy wrappers, and data bus components to scale and distribute the crawler.
 
 Frontera contain components to allow creation of fully-operational web crawler with `Scrapy`_. Even though it was
@@ -50,6 +51,7 @@ Using Frontera
    :hidden:
 
    topics/installation
+   topics/strategies
    topics/frontier-objects
    topics/frontier-middlewares
    topics/frontier-canonicalsolvers
@@ -62,6 +64,9 @@ Using Frontera
 :doc:`topics/installation`
     HOWTO and Dependencies options.
 
+:doc:`topics/strategies`
+    A list of built-in crawling strategies.
+
 :doc:`topics/frontier-objects`
     Understand the classes used to represent requests and responses.
 
@@ -72,7 +77,7 @@ Using Frontera
     Identify and make use of canonical url of document.
 
 :doc:`topics/frontier-backends`
-    Define your own crawling policy and custom storage.
+    Built-in backends, and tips on implementing your own.
 
 :doc:`topics/message_bus`
     Built-in message bus reference.
@@ -127,13 +132,10 @@ Developer documentation
    topics/tests
    topics/loggers
    topics/frontier-tester
-   topics/faq
    topics/contributing
    topics/glossary
 
-
-
 :doc:`topics/architecture`
     See how Frontera works and its different components.
 
@@ -155,13 +157,9 @@ Developer documentation
 :doc:`topics/frontier-tester`
     Test your frontier in an easy way.
 
-:doc:`topics/faq`
-    Frequently asked questions.
-
 :doc:`topics/contributing`
     HOWTO contribute.
 
-
 :doc:`topics/glossary`
     Glossary of terms.
 
diff --git a/docs/source/topics/custom_crawling_strategy.rst b/docs/source/topics/custom_crawling_strategy.rst
index d09076fab..dbeac5522 100644
--- a/docs/source/topics/custom_crawling_strategy.rst
+++ b/docs/source/topics/custom_crawling_strategy.rst
@@ -1,6 +1,6 @@
-=================
-Crawling strategy
-=================
+================================
+Writing custom crawling strategy
+================================
 
 Crawling strategy is an essential part of Frontera-based crawler and it's guiding the crawler by instructing it which
 pages to crawl, when and with what priority.
diff --git a/docs/source/topics/strategies.rst b/docs/source/topics/strategies.rst
index 77188d956..f5743aa1c 100644
--- a/docs/source/topics/strategies.rst
+++ b/docs/source/topics/strategies.rst
@@ -1,4 +1,44 @@
-=======================================
-List of crawling strategies in Frontera
-=======================================
+===================
+Crawling strategies
+===================
 
+Basic
+=====
+
+Location: :class:`frontera.strategy.basic.BasicCrawlingStrategy`
+
+Designed to showcase the minimum amount of code needed to implement a working :term:`crawling strategy`. It reads the
+seed URLs, schedules all of them and crawls indefinitely all links that are discovered during the crawl.
+
+Used for testing purposes too.
+
+
+Breadth-first
+=============
+
+Location: :class:`frontera.strategy.depth.BreadthFirstCrawlingStrategy`
+
+Starts with the seed URLs provided and prioritizes links depending on their distance from the seed page. The bigger
+the distance, the lower the priority. This will cause close pages to be crawled first.
+
+
+Depth-first
+===========
+
+Location: :class:`frontera.strategy.depth.DepthFirstCrawlingStrategy`
+
+The same as breadth-first, but the prioritization is opposite: the bigger the distance, the higher the priority. Thus,
+crawling deeper links first.
+
+
+Discovery
+=========
+
+Location: :class:`frontera.strategy.discovery.Discovery`
+
+This crawling strategy is used for crawling and discovery of websites on the Web. It respects robots.txt rules,
+follows sitemap.xml and has a limit on the number of pages to crawl from every website. It will also skip a website
+in case of fatal errors like connection resets or DNS resolution errors. There are two settings used to configure it
+(see the sketch below):
+
+* :setting:`DISCOVERY_MAX_PAGES`,
+* :setting:`USER_AGENT`
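+
+A configuration sketch for this strategy (the values shown are illustrative
+assumptions, and the user agent string is a made-up example)::
+
+    CRAWLING_STRATEGY = 'frontera.strategy.discovery.Discovery'
+    DISCOVERY_MAX_PAGES = 100  # upper bound on pages crawled per website
+    USER_AGENT = 'DiscoveryBot (+http://example.com/bot.html)'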

From ca53383924a6f063f6b12e1fe52891c379c8f89f Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Mon, 23 Jul 2018 17:49:13 +0500
Subject: [PATCH 214/273] style mainly

---
 frontera/strategy/discovery/__init__.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/frontera/strategy/discovery/__init__.py b/frontera/strategy/discovery/__init__.py
index f9cc1feba..09c513dbb 100644
--- a/frontera/strategy/discovery/__init__.py
+++ b/frontera/strategy/discovery/__init__.py
@@ -78,7 +78,7 @@ def consume_randomly(iterable):
 
 def is_valid_robotstxt(lines):
     for raw_line in lines:
-        line = raw_line.strip(u'\ufeff').lower() # '\xef\xbb\xbf' in case of bytes
+        line = raw_line.strip(u'\ufeff').lower()  # '\xef\xbb\xbf' in case of bytes
         if line and not line.startswith("#"):
             if line.startswith("user-agent:") or line.startswith("sitemap:"):
                 return True
@@ -135,7 +135,7 @@ def __init__(self, manager, args, mb_stream, states_context):
 
         try:
             psl_file = codecs.open("public_suffix_list.dat", encoding='utf8')
-        except FileNotFoundError as fne:
+        except IOError:
             self.logger.exception("Please get the public suffix file from https://publicsuffix.org/")
             raise
         self._suffix_list = PublicSuffixList(psl_file)
@@ -173,12 +173,12 @@ def read_seeds(self, stream):
                     processed += len(requests)
                     self.logger.info("Processed %d, scheduled %d urls.", processed, scheduled)
                     requests = []
-            except:
+            except Exception:
                 self.logger.exception("Error during seeds addition")
         if requests:
             try:
                 scheduled += self._schedule_batch(requests)
-            except:
+            except Exception:
                 self.logger.exception("Error during seeds addition")
             processed += len(requests)
         self.logger.info("Processed %d, and scheduled %d urls overall.", processed, scheduled)
@@ -293,7 +293,7 @@ def _process_robots_txt(self, response, domain):
         netloc = response.meta[b'netloc']
         domain.setdefault('queued_pages', 0)
         try:
-            body = response.body.decode('utf-8') # response.meta.get(b'encoding', 'utf-8')
+            body = response.body.decode('utf-8')  # TODO: use encoding from response.meta.get(b'encoding', 'utf-8')
         except UnicodeDecodeError:
             self.logger.warning("Error during robots.txt decoding at %s", response.url)
             update_domain_with_parser_data(domain, parser=None, url=response.url)
@@ -305,7 +305,7 @@ def _process_robots_txt(self, response, domain):
             if not is_valid_robotstxt(robots_lines):
                 raise SyntaxError("Robots.txt isn't valid")
             parser.parse(robots_lines)
-        except:
+        except Exception:
             self.logger.exception("Error during robots.txt parsing at %s", response.url)
             update_domain_with_parser_data(domain, parser=None, url=response.url)
             self._schedule_home_page(netloc, domain)
@@ -352,7 +352,7 @@ def _process_sitemap(self, netloc, body, domain):
                            b'scrapy_meta': sitemap_scrapy_meta} if sub_sitemap else (
                         {b'home': True} if is_home_page_url(url) else {})
                     request = self.create_request(url, meta=meta, headers=DEFAULT_HEADERS)
-                except:
+                except Exception:
                     self.logger.exception("Error on url %s", url)
                     continue
                 sitemaps.add(request) if sub_sitemap else requests.add(request)

From d8c9c3a34be9b03cdcf7849a2cddb32a7fb48533 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Tue, 24 Jul 2018 10:19:59 +0500
Subject: [PATCH 215/273] some mistakes corrected

---
 .../topics/custom_crawling_strategy.rst | 45 ++++++++-----------
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/docs/source/topics/custom_crawling_strategy.rst
b/docs/source/topics/custom_crawling_strategy.rst
index dbeac5522..aa5038a41 100644
--- a/docs/source/topics/custom_crawling_strategy.rst
+++ b/docs/source/topics/custom_crawling_strategy.rst
@@ -55,7 +55,7 @@ It has to be inherited from BaseCrawlingStrategy and implement it's API.
 The class can be put in any module and passed to :term:`strategy worker` or local Scrapy process using command line
 option or :setting:`CRAWLING_STRATEGY` setting on startup.
 
-The strategy class can use it's own storage or any other kind of resources. All items from :term:`spider log` will be
+The strategy class can use its own storage or any other kind of resources. All items from :term:`spider log` will be
 passed through these methods. Scores returned doesn't have to be the same as in method arguments.
 Periodically ``finished()`` method is called to check if crawling goal is achieved.
@@ -90,7 +90,7 @@ Main
 
     This is the main cycle used when crawl is in progress. In a nutshell on every spider event the specific handler is
     called, depending on the type of event. When strategy worker is getting the SIGTERM signal it's trying to stop politely
-    by calling close(). In it's normal state it listens for a spider log and executes the event handlers.
+    by calling close(). In its normal state it listens for a spider log and executes the event handlers.
 
     1. from_worker() → init()
     1. page_crawled(response) OR page_error(request, error) OR filter_extracted_links(request, links) and subsequent links_extracted(request, links)
@@ -107,19 +107,21 @@ often available from arguments of event handlers: _page_crawled_, _page_error_ a
 
 IMPORTANT NOTICE
 
-    The request created with create_request() is lack of state (meta[b'state']) set. To get the states strategy worker
-    needs to access the backend, and this is not happenning when you call create_request(). Instead it is expected you
-    will create a batch of requests and call refresh_states(iterable) on the whole batch of requests. After
-    refresh_states is done, you will have a states available for your newly created requests.
+    The request created with create_request() has no state (meta[b'state']) after creation. To get the states strategy
+    worker needs to access the backend, and this is not happening when you call create_request(). Instead it is
+    expected you will create a batch of requests and call refresh_states(iterable) on the whole batch of requests.
+    After refresh_states is done, you will have states available for your newly created requests.
+
+    The Request objects created by strategy worker for event handlers always have their states assigned.
 
-    The Request objects created by strategy worker for event handlers always have the states assigned.
 
 State operations
 ^^^^^^^^^^^^^^^^
 
 Every link has a state. The purpose of this states is to allow the developer to persist the state of the link in the
 system (allow restart of SW components without data loss) and use it for decision making. The states are cached in
-strategy worker, flushed to backend and will be loaded when needed. States can have following values:
+strategy worker, flushed to backend and will be loaded when needed. States are defined in
+:class:`frontera.core.components.States` and can have the following values:
 
 * NOT_CRAWLED,
 * QUEUED,
 * CRAWLED,
 * ERROR
 
 NOT_CRAWLED is assigned when link is new, and wasn't seen previously, the rest of the state values must be assigned in
 the crawling strategy code.
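+
+A minimal sketch of this convention in a strategy (handler names follow the
+event handlers described above; the 0.5 score is an arbitrary example)::
+
+    def links_extracted(self, request, links):
+        for link in links:
+            # schedule only links that haven't been seen before
+            if link.meta[b'state'] == States.NOT_CRAWLED:
+                link.meta[b'state'] = States.QUEUED
+                self.schedule(link, 0.5)
+
+    def page_error(self, request, error):
+        # mark failed downloads so they are not rescheduled blindly
+        request.meta[b'state'] = States.ERROR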
-States allow to implement such logic as:
-
-* Basic visit once of every link found,
-* Revisiting by time condition, if state is coupled with a timestamp (requires minor modification of backend),
-* Re-visiting of errored links depending on the type of error (fatal errors are skipped, and recoverable are revisited).
-* Analysis of the states database to collect the state stats using Hadoop jobs.
-
-See also
-
-https://github.com/scrapinghub/frontera/blob/master/frontera/core/components.py#L105
+States allow to check that a link was visited or discovered, and to perform analysis of the states database to collect
+state statistics using MapReduce style jobs.
 
 
 Components
 ==========
 
 There are certain building blocks and successful solutions exist for the common problems.
 
-DomainCache
------------
+DomainMetadata
+--------------
 
-It's often needed to persist per-host metadata in the permanent storage. To solve this there is a DomainCache available
-at class path frontera.contrib.backends.hbase.domaincache. It's has an interface of Python mapping types
-(https://docs.python.org/3/library/stdtypes.html?highlight=mapping#mapping-types-dict) and is backed by two generations
-of in-memory cache with LRU logic and persisted in HBase only (currently). It's expected that one will be using
-domain names as keys and dicts as values. It's convenient to store there per-domin statistics, ban states, the count
-of links found, etc.
+It's often needed to persist per-host metadata in the permanent storage. To solve this there is a
+:class:`frontera.core.components.DomainMetadata` instance in the backend. It has an interface of Python mapping types
+(https://docs.python.org/3/library/stdtypes.html?highlight=mapping#mapping-types-dict ). It's expected that one will
+be using domain names as keys and dicts as values. It's convenient to store there per-domain statistics, ban states,
+the count of links found, etc.
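+
+A usage sketch (assuming the strategy holds a reference to the backend's
+:class:`DomainMetadata` instance as ``self.domain_metadata``; the key and the
+stored fields are illustrative)::
+
+    stats = self.domain_metadata.setdefault('example.com', {})
+    stats['banned'] = True
+    stats['links_found'] = stats.get('links_found', 0) + 10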
 
 
 PublicSuffix

From ae138edcdb95623a4a41a484d0a9c37d317f6be1 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Tue, 24 Jul 2018 10:44:54 +0500
Subject: [PATCH 216/273] putting the code under name=main

---
 frontera/utils/add_seeds.py | 54 +++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/frontera/utils/add_seeds.py b/frontera/utils/add_seeds.py
index d3952dd6d..f884bd000 100644
--- a/frontera/utils/add_seeds.py
+++ b/frontera/utils/add_seeds.py
@@ -10,29 +10,31 @@
 logger = logging.getLogger(__name__)
 
-parser = ArgumentParser(description="Frontera local add seeds utility")
-parser.add_argument('--config', type=str, required=True,
-                    help='Settings module name, should be accessible by import')
-parser.add_argument('--log-level', '-L', type=str, default='INFO',
-                    help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
-parser.add_argument('--seeds-file', type=str, required=True, help="Seeds file path")
-args = parser.parse_args()
-settings = Settings(module=args.config)
-logging_config_path = settings.get("LOGGING_CONFIG")
-if logging_config_path and exists(logging_config_path):
-    fileConfig(logging_config_path, disable_existing_loggers=False)
-else:
-    logging.basicConfig(level=args.log_level)
-    logger.setLevel(args.log_level)
-    logger.addHandler(CONSOLE)
-
-fh = open(args.seeds_file, "rb")
-
-logger.info("Starting local seeds addition from file %s", args.seeds_file)
-
-manager = LocalFrontierManager.from_settings(settings)
-manager.add_seeds(fh)
-manager.stop()
-manager.close()
-
-logger.info("Seeds addition finished")
\ No newline at end of file
+
+if __name__ == '__main__':
+    parser = ArgumentParser(description="Frontera local add seeds utility")
+    parser.add_argument('--config', type=str, required=True,
+                        help='Settings module name, should be accessible by import')
+    parser.add_argument('--log-level', '-L', type=str, default='INFO',
+                        help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL")
+    parser.add_argument('--seeds-file', type=str, required=True, help="Seeds file path")
+    args = parser.parse_args()
+    settings = Settings(module=args.config)
+    logging_config_path = settings.get("LOGGING_CONFIG")
+    if logging_config_path and exists(logging_config_path):
+        fileConfig(logging_config_path, disable_existing_loggers=False)
+    else:
+        logging.basicConfig(level=args.log_level)
+        logger.setLevel(args.log_level)
+        logger.addHandler(CONSOLE)
+
+    fh = open(args.seeds_file, "rb")
+
+    logger.info("Starting local seeds addition from file %s", args.seeds_file)
+
+    manager = LocalFrontierManager.from_settings(settings)
+    manager.add_seeds(fh)
+    manager.stop()
+    manager.close()
+
+    logger.info("Seeds addition finished")
\ No newline at end of file

From 2faf2808d9d57dd2c9c330d9a6eaaacc55b81de1 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Tue, 24 Jul 2018 10:45:53 +0500
Subject: [PATCH 217/273] is -> ==

---
 tests/mocks/components.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/mocks/components.py b/tests/mocks/components.py
index 61255b38b..2e69847a5 100644
--- a/tests/mocks/components.py
+++ b/tests/mocks/components.py
@@ -203,7 +203,7 @@ def read_seeds(self, fh):
             url = url.strip()
             req = self.create_request(url)
             self.refresh_states(req)
-            if req.meta[b'state'] is States.NOT_CRAWLED:
+            if req.meta[b'state'] == States.NOT_CRAWLED:
                 req.meta[b'state'] = States.QUEUED
                 self.schedule(req)
 
@@ -215,7 +215,7 @@ def filter_extracted_links(self, request, links):
 
     def links_extracted(self, request, links):
         for link in links:
-            if link.meta[b'state'] is States.NOT_CRAWLED:
+            if link.meta[b'state'] == States.NOT_CRAWLED:
                 link.meta[b'state'] = States.QUEUED
                 self.schedule(link, 0.5)
 

From fc161f69cd6a96302364c9d8f17fceda5d5c5ba2 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Tue, 24 Jul 2018 10:47:21 +0500
Subject: [PATCH 218/273] docstring

---
 frontera/core/manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontera/core/manager.py b/frontera/core/manager.py
index 3e499b113..b7e6a31fd 100644
--- a/frontera/core/manager.py
+++ b/frontera/core/manager.py
@@ -487,7 +487,7 @@ def add_seeds(self, seeds_file):
         """
         Performs seeds addition procedure. Using file-like object, calls read_seeds method of crawling strategy.
- :param file seeds_file: A file-like object passed to read_seeds + :param file seeds_file: A file-like object opened in binary mode which will be passed to read_seeds :return: None. """ From 8e13eb12aedb424ed758c659da76eb099b67e00a Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 24 Jul 2018 10:56:58 +0500 Subject: [PATCH 219/273] page_error -> request_error --- examples/cluster/bc/broadcrawl/__init__.py | 112 --------------------- frontera/strategy/__init__.py | 29 ++---- frontera/strategy/basic.py | 2 +- frontera/strategy/depth.py | 2 +- frontera/strategy/discovery/__init__.py | 2 +- frontera/worker/strategy.py | 2 +- tests/backends.py | 2 +- tests/mocks/components.py | 2 +- tests/test_strategy.py | 2 +- 9 files changed, 16 insertions(+), 139 deletions(-) delete mode 100644 examples/cluster/bc/broadcrawl/__init__.py diff --git a/examples/cluster/bc/broadcrawl/__init__.py b/examples/cluster/bc/broadcrawl/__init__.py deleted file mode 100644 index 679f7d7fc..000000000 --- a/examples/cluster/bc/broadcrawl/__init__.py +++ /dev/null @@ -1,112 +0,0 @@ -# -*- coding: utf-8 -*- -from frontera.core.components import States -from frontera.strategy import BaseCrawlingStrategy -from frontera.contrib.backends.hbase import HBaseBackend -from cachetools import LRUCache -from msgpack import packb, unpackb -import logging -from datetime import timedelta - -from six.moves.urllib import parse as urlparse -import six - - -class DomainCache(LRUCache): - def __init__(self, maxsize, connection, table_name): - super(DomainCache, self).__init__(maxsize) - self.table = connection.table(table_name) - - def popitem(self): - key, value = super(DomainCache, self).popitem() - self._store_item(self.table, key, value) - - def __missing__(self, key): - row = self.table.row(key) - if not row: - super(DomainCache, self).__missing__(key) - raise KeyError - value = {} - for k, v in row.iteritems(): - cf, _, col = k.partition(':') - value[col] = unpackb(v) - self.__setitem__(key, value) - return value - - def _store_item(self, batch, key, value): - data = {} - assert isinstance(value, dict) - for k, v in six.iteritems(value): - data["m:%s" % k] = packb(v) - batch.put(key, data) - - def flush(self): - with self.table.batch() as b: - for k, v in six.iteritems(self): - self._store_item(b, k, v) - - -class BCPerHostLimit(BaseCrawlingStrategy): - - def __init__(self, manager, mb_stream, states_context): - settings = manager.settings - backend = manager.backend - assert isinstance(backend, HBaseBackend), "This strategy supports HBaseBackend only." 
-        self.conn = backend.connection
-        self.domain_cache = DomainCache(10000, self.conn, "domain_metadata")
-        self.max_pages_per_hostname = settings.get("MAX_PAGES_PER_HOSTNAME")
-        assert self.max_pages_per_hostname is not None
-        self.logger = logging.getLogger("bcperhostlimit-strategy")
-        super(BCPerHostLimit, self).__init__(manager, mb_stream, states_context)
-
-    def add_seeds(self, seeds):
-        self._schedule_and_count(seeds)
-
-    def page_crawled(self, response):
-        response.meta[b'state'] = States.CRAWLED
-        domain = self._get_domain_bucket(response.url)
-        domain['cp'] = domain.get('cp', 0)+1
-
-    def links_extracted(self, request, links):
-        self._schedule_and_count(links)
-
-    def page_error(self, request, error):
-        request.meta[b'state'] = States.ERROR
-        self.schedule(request, score=0.0, dont_queue=True)
-
-    def _schedule_and_count(self, links):
-        counts = dict()
-        for link in links:
-            if link.meta[b'state'] is not States.NOT_CRAWLED:
-                continue
-            link.meta[b'state'] = States.QUEUED
-            url_parts = urlparse.urlparse(link.url)
-            if not url_parts.hostname:
-                continue
-            hostname = url_parts.hostname
-            if hostname not in counts:
-                domain = self.domain_cache.setdefault(hostname, {})
-                counts[hostname] = domain.get('sc', 0)
-            if counts[hostname] >= self.max_pages_per_hostname:
-                self.logger.debug("Reached per host limit for URL %s, "
-                                  "already scheduled %d of %d allowed.", link.url, counts[hostname],
-                                  self.max_pages_per_hostname)
-                continue
-            path_parts = url_parts.path.split('/')
-            score = 0.5 / (max(len(path_parts), 1.0) + len(url_parts.path) * 0.1)
-            self.schedule(link, score)
-            counts[hostname] += 1
-            if counts[hostname] == self.max_pages_per_hostname:
-                self.logger.info("Reached per host limit for domain %s (%d)", hostname, self.max_pages_per_hostname)
-
-        for hostname, count in six.iteritems(counts):
-            domain = self.domain_cache.setdefault(hostname, {})
-            domain['sc'] = domain.get('sc', 0)+count
-
-    def _get_domain_bucket(self, url):
-        parsed = urlparse.urlsplit(url)
-        hostname, _, _ = parsed.netloc.partition(':')
-        return self.domain_cache.setdefault(hostname, {})
-
-    def close(self):
-        self.domain_cache.flush()
-        super(BCPerHostLimit, self).close()
diff --git a/frontera/strategy/__init__.py b/frontera/strategy/__init__.py
index 1fc3b431a..8fe6f223c 100644
--- a/frontera/strategy/__init__.py
+++ b/frontera/strategy/__init__.py
@@ -55,6 +55,15 @@ def page_crawled(self, response):
         :param object response: The :class:`Response ` object for the crawled page.
         """
 
+    @abstractmethod
+    def request_error(self, request, error):
+        """
+        Called every time there was an error during page downloading.
+
+        :param object request: The :class:`Request ` object that failed to download.
+        :param str error: A string identifier for the error.
+        """
+
     @abstractmethod
     def filter_extracted_links(self, request, links):
         """
@@ -86,15 +95,6 @@ def links_extracted(self, request, links):
         the links extracted for the crawled page.
         """
 
-    @abstractmethod
-    def page_error(self, request, error):
-        """
-        Called every time there was error during page downloading.
-
-        :param object request: The fetched with error :class:`Request ` object.
-        :param str error: A string identifier for the error.
-        """
-
     def finished(self):
         """
         Called by Strategy worker, after finishing processing each cycle of spider log. If this method returns true,
@@ -144,17 +144,6 @@ def refresh_states(self, requests):
         """
         self._states_context.refresh_and_keep(requests)
 
-    def request_error(self, request, error):
-        """
-        DEPRECATED.
- - Convenience method, called by FronteraManager, please use page_error() instead. - - :param request: :class:`Request ` - :param error: str with error description - """ - self.page_error(request, error) - def frontier_start(self): pass diff --git a/frontera/strategy/basic.py b/frontera/strategy/basic.py index d6a70731b..b0e586f86 100644 --- a/frontera/strategy/basic.py +++ b/frontera/strategy/basic.py @@ -21,5 +21,5 @@ def links_extracted(self, request, links): def page_crawled(self, response): response.meta[b'state'] = States.CRAWLED - def page_error(self, request, error): + def request_error(self, request, error): request.meta[b'state'] = States.ERROR \ No newline at end of file diff --git a/frontera/strategy/depth.py b/frontera/strategy/depth.py index fc1c99d95..8c83b852b 100644 --- a/frontera/strategy/depth.py +++ b/frontera/strategy/depth.py @@ -28,7 +28,7 @@ def links_extracted(self, request, links): link.meta[b'state'] = States.QUEUED self.schedule(link, self.get_score(link)) - def page_error(self, request, error): + def request_error(self, request, error): request.meta[b'state'] = States.ERROR self.schedule(request, score=0.0, dont_queue=True) diff --git a/frontera/strategy/discovery/__init__.py b/frontera/strategy/discovery/__init__.py index 09c513dbb..0a615938a 100644 --- a/frontera/strategy/discovery/__init__.py +++ b/frontera/strategy/discovery/__init__.py @@ -265,7 +265,7 @@ def links_extracted(self, request, links): link.headers.update(DEFAULT_HEADERS) self._process_links(links, domain) - def page_error(self, request, error): + def request_error(self, request, error): request.meta[b'state'] = States.ERROR # if redirects, request.url always contains initial url self.logger.debug("PE %s error: %s (seed: %s)", diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index f0cdb5aff..54cda77ce 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -131,7 +131,7 @@ def _on_links_extracted(self, request, links): def _on_request_error(self, request, error): logger.debug("Page error %s (%s)", request.url, error) self.states_context.states.set_states(request) - self.strategy.page_error(request, error) + self.strategy.request_error(request, error) self.states_context.states.update_cache(request) diff --git a/tests/backends.py b/tests/backends.py index 595012d71..253855695 100644 --- a/tests/backends.py +++ b/tests/backends.py @@ -44,7 +44,7 @@ def links_extracted(self, request, links): def page_crawled(self, response): response.meta[b'state'] = States.CRAWLED - def page_error(self, request, error): + def request_error(self, request, error): request.meta[b'state'] = States.ERROR diff --git a/tests/mocks/components.py b/tests/mocks/components.py index 2e69847a5..710a9f753 100644 --- a/tests/mocks/components.py +++ b/tests/mocks/components.py @@ -219,5 +219,5 @@ def links_extracted(self, request, links): link.meta[b'state'] = States.QUEUED self.schedule(link, 0.5) - def page_error(self, request, error): + def request_error(self, request, error): request.meta[b'state'] = States.ERROR \ No newline at end of file diff --git a/tests/test_strategy.py b/tests/test_strategy.py index 2771eea1f..57140d143 100644 --- a/tests/test_strategy.py +++ b/tests/test_strategy.py @@ -14,7 +14,7 @@ def read_seeds(self, seeds_file): def page_crawled(self, response): pass - def page_error(self, request, error): + def request_error(self, request, error): pass def links_extracted(self, request, links): From a6d1c37351fe10a74993b337f2ee0f096f02144c Mon Sep 17 00:00:00 2001 
From: Alexander Sibiryakov Date: Tue, 24 Jul 2018 14:00:22 +0500 Subject: [PATCH 220/273] states tests extraction --- tests/contrib/backends/hbase/test_hbase.py | 8 ++++ tests/contrib/backends/test_backend.py | 44 ++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 tests/contrib/backends/test_backend.py diff --git a/tests/contrib/backends/hbase/test_hbase.py b/tests/contrib/backends/hbase/test_hbase.py index 4b8400d41..67477c172 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -9,6 +9,7 @@ from Hbase_thrift import AlreadyExists # module loaded at runtime in happybase from frontera.contrib.backends.hbase import HBaseState, HBaseMetadata, HBaseQueue +from tests.contrib.backends.test_backend import StatesTester from frontera.core.models import Request, Response from frontera.core.components import States from binascii import unhexlify @@ -76,6 +77,7 @@ def test_queue_with_delay(self): assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1, max_requests_per_host=10)]) == set([r5.url]) + @pytest.mark.skip def test_state(self): connection = Connection(host='hbase-docker', port=9090) state = HBaseState(connection, b'states', cache_size_limit=300000, @@ -125,3 +127,9 @@ def test_drop_all_tables_when_table_name_is_str(self): write_log_size=10, drop_all_tables=True) except AlreadyExists: assert False, "failed to drop hbase tables" + + +class TestHBaseStates(StatesTester): + + + def get_backend(self): diff --git a/tests/contrib/backends/test_backend.py b/tests/contrib/backends/test_backend.py new file mode 100644 index 000000000..a40fc462a --- /dev/null +++ b/tests/contrib/backends/test_backend.py @@ -0,0 +1,44 @@ +import six +from abc import ABCMeta, abstractmethod +from frontera.core.models import Request, Response +from frontera.core.components import States + + +r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', + b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) +r2 = Request('http://example.com/some/page/', meta={b'fingerprint': b'11', + b'domain': {b'name': b'example.com', b'fingerprint': b'82'}}) +r3 = Request('http://www.scrapy.org', meta={b'fingerprint': b'12', + b'domain': {b'name': b'www.scrapy.org', b'fingerprint': b'83'}}) +r4 = r3.copy() + + +@six.add_metaclass(ABCMeta) +class StatesTester(object): + + @abstractmethod + def get_backend(self): + pass + + def test_states(self): + states = self.get_backend().states + states.set_states([r1, r2, r3]) + assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 + states.update_cache([r1, r2, r3]) + states.flush() + + r1.meta[b'state'] = States.CRAWLED + r2.meta[b'state'] = States.ERROR + r3.meta[b'state'] = States.QUEUED + states.update_cache([r1, r2, r3]) + states.flush() + + r1.meta[b'state'] = States.NOT_CRAWLED + r2.meta[b'state'] = States.NOT_CRAWLED + r3.meta[b'state'] = States.NOT_CRAWLED + + states.fetch([b'83']) + states.set_states([r1, r2, r4]) + assert r4.meta[b'state'] == States.QUEUED + assert r1.meta[b'state'] == States.CRAWLED + assert r2.meta[b'state'] == States.ERROR \ No newline at end of file From 2c982d258cf7ebdb04fe5374da80d3b51543b906 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 24 Jul 2018 14:32:48 +0500 Subject: [PATCH 221/273] new hbase states test --- tests/contrib/backends/hbase/test_hbase.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/contrib/backends/hbase/test_hbase.py 
b/tests/contrib/backends/hbase/test_hbase.py index 67477c172..7a7d5e7ef 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -17,6 +17,7 @@ from w3lib.util import to_native_str from tests import mock import pytest +from unittest import TestCase r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) @@ -129,7 +130,15 @@ def test_drop_all_tables_when_table_name_is_str(self): assert False, "failed to drop hbase tables" -class TestHBaseStates(StatesTester): - +class TestHBaseStates(StatesTester, TestCase): + @classmethod + def setUpClass(cls): + s = cls() + s.connection = Connection(host='hbase-docker', port=9090) + s.states = HBaseState(s.connection, b'states', cache_size_limit=300000, + write_log_size=5000, drop_all_tables=True) + return s def get_backend(self): + return self.states + From 2dffcab9a430bf1461f24d2736b8327b22e193a3 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 24 Jul 2018 14:56:22 +0500 Subject: [PATCH 222/273] fix --- tests/contrib/backends/hbase/test_hbase.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/contrib/backends/hbase/test_hbase.py b/tests/contrib/backends/hbase/test_hbase.py index 7a7d5e7ef..41e1e94bd 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -140,5 +140,5 @@ def setUpClass(cls): return s def get_backend(self): - return self.states + return self From 17c448943303d1bc542d7c7e59fa1ab4937a0b68 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 24 Jul 2018 19:39:11 +0500 Subject: [PATCH 223/273] moving states tester --- frontera/utils/tester.py | 50 +++++++++++++++++++++- tests/contrib/backends/hbase/test_hbase.py | 19 +++----- tests/contrib/backends/test_backend.py | 44 ------------------- 3 files changed, 55 insertions(+), 58 deletions(-) delete mode 100644 tests/contrib/backends/test_backend.py diff --git a/frontera/utils/tester.py b/frontera/utils/tester.py index b8f979ced..6874d273e 100644 --- a/frontera/utils/tester.py +++ b/frontera/utils/tester.py @@ -1,11 +1,15 @@ from __future__ import absolute_import, print_function from collections import OrderedDict, deque -from six.moves.urllib.parse import urlparse + import six -from six.moves import range +from abc import ABCMeta, abstractmethod +from frontera.core.components import States +from frontera.core.models import Request from io import BytesIO from os import linesep +from six.moves import range +from six.moves.urllib.parse import urlparse class FrontierTester(object): @@ -149,3 +153,45 @@ def downloader_info(self): def idle(self): return len(self.slots) == 0 + + + + +r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', + b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) +r2 = Request('http://example.com/some/page/', meta={b'fingerprint': b'11', + b'domain': {b'name': b'example.com', b'fingerprint': b'82'}}) +r3 = Request('http://www.scrapy.org', meta={b'fingerprint': b'12', + b'domain': {b'name': b'www.scrapy.org', b'fingerprint': b'83'}}) +r4 = r3.copy() + + +@six.add_metaclass(ABCMeta) +class StatesTester(object): + + @abstractmethod + def get_backend(self): + pass + + def test_states(self): + states = self.get_backend().states + states.set_states([r1, r2, r3]) + assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 + states.update_cache([r1, r2, r3]) + states.flush() + + r1.meta[b'state'] = States.CRAWLED 
+ r2.meta[b'state'] = States.ERROR + r3.meta[b'state'] = States.QUEUED + states.update_cache([r1, r2, r3]) + states.flush() + + r1.meta[b'state'] = States.NOT_CRAWLED + r2.meta[b'state'] = States.NOT_CRAWLED + r3.meta[b'state'] = States.NOT_CRAWLED + + states.fetch([b'83']) + states.set_states([r1, r2, r4]) + assert r4.meta[b'state'] == States.QUEUED + assert r1.meta[b'state'] == States.CRAWLED + assert r2.meta[b'state'] == States.ERROR \ No newline at end of file diff --git a/tests/contrib/backends/hbase/test_hbase.py b/tests/contrib/backends/hbase/test_hbase.py index 41e1e94bd..a6153962f 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -1,23 +1,18 @@ from __future__ import absolute_import -from time import sleep, time -from binascii import unhexlify - -from msgpack import unpackb -from happybase import Connection -from w3lib.util import to_native_str from Hbase_thrift import AlreadyExists # module loaded at runtime in happybase +from binascii import unhexlify +from time import time +import pytest from frontera.contrib.backends.hbase import HBaseState, HBaseMetadata, HBaseQueue -from tests.contrib.backends.test_backend import StatesTester -from frontera.core.models import Request, Response from frontera.core.components import States -from binascii import unhexlify -from time import time -from w3lib.util import to_native_str +from frontera.core.models import Request, Response +from frontera.utils.tester import StatesTester +from happybase import Connection from tests import mock -import pytest from unittest import TestCase +from w3lib.util import to_native_str r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) diff --git a/tests/contrib/backends/test_backend.py b/tests/contrib/backends/test_backend.py deleted file mode 100644 index a40fc462a..000000000 --- a/tests/contrib/backends/test_backend.py +++ /dev/null @@ -1,44 +0,0 @@ -import six -from abc import ABCMeta, abstractmethod -from frontera.core.models import Request, Response -from frontera.core.components import States - - -r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', - b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) -r2 = Request('http://example.com/some/page/', meta={b'fingerprint': b'11', - b'domain': {b'name': b'example.com', b'fingerprint': b'82'}}) -r3 = Request('http://www.scrapy.org', meta={b'fingerprint': b'12', - b'domain': {b'name': b'www.scrapy.org', b'fingerprint': b'83'}}) -r4 = r3.copy() - - -@six.add_metaclass(ABCMeta) -class StatesTester(object): - - @abstractmethod - def get_backend(self): - pass - - def test_states(self): - states = self.get_backend().states - states.set_states([r1, r2, r3]) - assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 - states.update_cache([r1, r2, r3]) - states.flush() - - r1.meta[b'state'] = States.CRAWLED - r2.meta[b'state'] = States.ERROR - r3.meta[b'state'] = States.QUEUED - states.update_cache([r1, r2, r3]) - states.flush() - - r1.meta[b'state'] = States.NOT_CRAWLED - r2.meta[b'state'] = States.NOT_CRAWLED - r3.meta[b'state'] = States.NOT_CRAWLED - - states.fetch([b'83']) - states.set_states([r1, r2, r4]) - assert r4.meta[b'state'] == States.QUEUED - assert r1.meta[b'state'] == States.CRAWLED - assert r2.meta[b'state'] == States.ERROR \ No newline at end of file From 26eff5e3b86da8b44f385c8c419eda97d7d6fe36 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 24 Jul 2018 
19:43:27 +0500 Subject: [PATCH 224/273] style --- frontera/utils/tester.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/frontera/utils/tester.py b/frontera/utils/tester.py index 6874d273e..29b055826 100644 --- a/frontera/utils/tester.py +++ b/frontera/utils/tester.py @@ -155,8 +155,6 @@ def idle(self): return len(self.slots) == 0 - - r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) r2 = Request('http://example.com/some/page/', meta={b'fingerprint': b'11', From ed5c4af28a4f51d9d80d11eaa8558bf577a8687a Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 24 Jul 2018 20:20:23 +0500 Subject: [PATCH 225/273] using fixtures --- frontera/utils/tester.py | 42 +------------------- tests/contrib/backends/hbase/test_hbase.py | 15 ------- tests/contrib/backends/test_backends.py | 46 ++++++++++++++++++++++ 3 files changed, 47 insertions(+), 56 deletions(-) create mode 100644 tests/contrib/backends/test_backends.py diff --git a/frontera/utils/tester.py b/frontera/utils/tester.py index 29b055826..7dd107246 100644 --- a/frontera/utils/tester.py +++ b/frontera/utils/tester.py @@ -3,9 +3,7 @@ from collections import OrderedDict, deque import six -from abc import ABCMeta, abstractmethod -from frontera.core.components import States -from frontera.core.models import Request + from io import BytesIO from os import linesep from six.moves import range @@ -155,41 +153,3 @@ def idle(self): return len(self.slots) == 0 -r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', - b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) -r2 = Request('http://example.com/some/page/', meta={b'fingerprint': b'11', - b'domain': {b'name': b'example.com', b'fingerprint': b'82'}}) -r3 = Request('http://www.scrapy.org', meta={b'fingerprint': b'12', - b'domain': {b'name': b'www.scrapy.org', b'fingerprint': b'83'}}) -r4 = r3.copy() - - -@six.add_metaclass(ABCMeta) -class StatesTester(object): - - @abstractmethod - def get_backend(self): - pass - - def test_states(self): - states = self.get_backend().states - states.set_states([r1, r2, r3]) - assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 - states.update_cache([r1, r2, r3]) - states.flush() - - r1.meta[b'state'] = States.CRAWLED - r2.meta[b'state'] = States.ERROR - r3.meta[b'state'] = States.QUEUED - states.update_cache([r1, r2, r3]) - states.flush() - - r1.meta[b'state'] = States.NOT_CRAWLED - r2.meta[b'state'] = States.NOT_CRAWLED - r3.meta[b'state'] = States.NOT_CRAWLED - - states.fetch([b'83']) - states.set_states([r1, r2, r4]) - assert r4.meta[b'state'] == States.QUEUED - assert r1.meta[b'state'] == States.CRAWLED - assert r2.meta[b'state'] == States.ERROR \ No newline at end of file diff --git a/tests/contrib/backends/hbase/test_hbase.py b/tests/contrib/backends/hbase/test_hbase.py index a6153962f..43858364f 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -8,10 +8,8 @@ from frontera.contrib.backends.hbase import HBaseState, HBaseMetadata, HBaseQueue from frontera.core.components import States from frontera.core.models import Request, Response -from frontera.utils.tester import StatesTester from happybase import Connection from tests import mock -from unittest import TestCase from w3lib.util import to_native_str r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', @@ -124,16 +122,3 @@ def test_drop_all_tables_when_table_name_is_str(self): except 
AlreadyExists: assert False, "failed to drop hbase tables" - -class TestHBaseStates(StatesTester, TestCase): - @classmethod - def setUpClass(cls): - s = cls() - s.connection = Connection(host='hbase-docker', port=9090) - s.states = HBaseState(s.connection, b'states', cache_size_limit=300000, - write_log_size=5000, drop_all_tables=True) - return s - - def get_backend(self): - return self - diff --git a/tests/contrib/backends/test_backends.py b/tests/contrib/backends/test_backends.py new file mode 100644 index 000000000..02f49831d --- /dev/null +++ b/tests/contrib/backends/test_backends.py @@ -0,0 +1,46 @@ +import pytest +from frontera.core.components import States +from frontera.core.models import Request +from happybase import Connection +from frontera.contrib.backends.hbase import HBaseState + + +r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', + b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) +r2 = Request('http://example.com/some/page/', meta={b'fingerprint': b'11', + b'domain': {b'name': b'example.com', b'fingerprint': b'82'}}) +r3 = Request('http://www.scrapy.org', meta={b'fingerprint': b'12', + b'domain': {b'name': b'www.scrapy.org', b'fingerprint': b'83'}}) +r4 = r3.copy() + + +@pytest.fixture +def hbase_states(): + connection = Connection(host='hbase-docker', port=9090) + states = HBaseState(connection, b'states', cache_size_limit=300000, + write_log_size=5000, drop_all_tables=True) + return states + + +@pytest.mark.parametrize("states", [hbase_states()]) +def test_states(states): + states.set_states([r1, r2, r3]) + assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 + states.update_cache([r1, r2, r3]) + states.flush() + + r1.meta[b'state'] = States.CRAWLED + r2.meta[b'state'] = States.ERROR + r3.meta[b'state'] = States.QUEUED + states.update_cache([r1, r2, r3]) + states.flush() + + r1.meta[b'state'] = States.NOT_CRAWLED + r2.meta[b'state'] = States.NOT_CRAWLED + r3.meta[b'state'] = States.NOT_CRAWLED + + states.fetch([b'83']) + states.set_states([r1, r2, r4]) + assert r4.meta[b'state'] == States.QUEUED + assert r1.meta[b'state'] == States.CRAWLED + assert r2.meta[b'state'] == States.ERROR \ No newline at end of file From a94ee7809386234beb63ec8a3b38922b5f2f7698 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 24 Jul 2018 20:33:08 +0500 Subject: [PATCH 226/273] sqlalchemy states --- tests/contrib/backends/test_backends.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/contrib/backends/test_backends.py b/tests/contrib/backends/test_backends.py index 02f49831d..0e1d99e4c 100644 --- a/tests/contrib/backends/test_backends.py +++ b/tests/contrib/backends/test_backends.py @@ -3,6 +3,10 @@ from frontera.core.models import Request from happybase import Connection from frontera.contrib.backends.hbase import HBaseState +from frontera.contrib.backends.sqlalchemy import States as SQLAlchemyStates +from frontera.contrib.backends.sqlalchemy.models import StateModel +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', @@ -22,7 +26,14 @@ def hbase_states(): return states -@pytest.mark.parametrize("states", [hbase_states()]) +def sqlalchemy_states(): + engine = create_engine('sqlite:///:memory:', echo=False) + session_cls = sessionmaker() + session_cls.configure(engine=engine) + return SQLAlchemyStates(session_cls, StateModel, 100) + + +@pytest.mark.parametrize("states", [hbase_states(), 
sqlalchemy_states()]) def test_states(states): states.set_states([r1, r2, r3]) assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 From 5631001db3d6219e2b1ce787fbacbda6e9ad8799 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 24 Jul 2018 20:37:38 +0500 Subject: [PATCH 227/273] test fix --- tests/contrib/backends/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/contrib/backends/test_backends.py b/tests/contrib/backends/test_backends.py index 0e1d99e4c..4e9fde0d3 100644 --- a/tests/contrib/backends/test_backends.py +++ b/tests/contrib/backends/test_backends.py @@ -29,7 +29,7 @@ def hbase_states(): def sqlalchemy_states(): engine = create_engine('sqlite:///:memory:', echo=False) session_cls = sessionmaker() - session_cls.configure(engine=engine) + session_cls.configure(bind=engine) return SQLAlchemyStates(session_cls, StateModel, 100) From 4205ea2d713c73ce24bd5432ff7c08e454de41c3 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 24 Jul 2018 20:44:28 +0500 Subject: [PATCH 228/273] table creation --- tests/contrib/backends/test_backends.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/contrib/backends/test_backends.py b/tests/contrib/backends/test_backends.py index 4e9fde0d3..0af8ac4ad 100644 --- a/tests/contrib/backends/test_backends.py +++ b/tests/contrib/backends/test_backends.py @@ -30,6 +30,7 @@ def sqlalchemy_states(): engine = create_engine('sqlite:///:memory:', echo=False) session_cls = sessionmaker() session_cls.configure(bind=engine) + StateModel.__table__.create(bind=engine) return SQLAlchemyStates(session_cls, StateModel, 100) From b46600b93af0b394033e28675f96f83be76a5392 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 25 Jul 2018 10:15:52 +0500 Subject: [PATCH 229/273] updated tests --- tests/contrib/backends/hbase/test_hbase.py | 40 ---------- tests/contrib/backends/test_backends.py | 90 ++++++++++++++++++---- 2 files changed, 73 insertions(+), 57 deletions(-) diff --git a/tests/contrib/backends/hbase/test_hbase.py b/tests/contrib/backends/hbase/test_hbase.py index 43858364f..88e440267 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -43,17 +43,6 @@ def test_metadata(self): set([r1.url, r2.url, r3.url]) self.delete_rows(table, [b'10', b'11', b'12']) - def test_queue(self): - connection = Connection(host='hbase-docker', port=9090) - queue = HBaseQueue(connection, 2, b'queue', drop=True, use_snappy=False) - batch = [('10', 0.5, r1, True), ('11', 0.6, r2, True), - ('12', 0.7, r3, True)] - queue.schedule(batch) - assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1, - max_requests_per_host=10)]) == set([r3.url]) - assert set([r.url for r in queue.get_next_requests(10, 1, min_requests=3, min_hosts=1, - max_requests_per_host=10)]) == set([r1.url, r2.url]) - @pytest.mark.xfail def test_queue_with_delay(self): connection = Connection(host='hbase-docker', port=9090) @@ -71,35 +60,6 @@ def test_queue_with_delay(self): assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1, max_requests_per_host=10)]) == set([r5.url]) - @pytest.mark.skip - def test_state(self): - connection = Connection(host='hbase-docker', port=9090) - state = HBaseState(connection, b'states', cache_size_limit=300000, - write_log_size=5000, drop_all_tables=True) - state.set_states([r1, r2, r3]) - assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 - 
state.update_cache([r1, r2, r3]) - assert dict(state._state_cache) == {b'10': States.NOT_CRAWLED, - b'11': States.NOT_CRAWLED, - b'12': States.NOT_CRAWLED} - assert state._state_batch._mutation_count == 3 - r1.meta[b'state'] = States.CRAWLED - r2.meta[b'state'] = States.CRAWLED - r3.meta[b'state'] = States.CRAWLED - state.update_cache([r1, r2, r3]) - assert state._state_batch._mutation_count == 6 - state.flush() - assert state._state_batch._mutation_count == 0 - state.fetch([b'10', b'11', b'12']) - assert dict(state._state_cache) == {b'10': States.CRAWLED, - b'11': States.CRAWLED, - b'12': States.CRAWLED} - r4.meta[b'state'] = States.ERROR - state.set_states([r1, r2, r4]) - assert r4.meta[b'state'] == States.CRAWLED - state.flush() - assert state._state_batch._mutation_count == 0 - def test_drop_all_tables_when_table_name_is_str(self): connection = Connection(host='hbase-docker', port=9090) for table in connection.tables(): diff --git a/tests/contrib/backends/test_backends.py b/tests/contrib/backends/test_backends.py index 0af8ac4ad..b6837e8f3 100644 --- a/tests/contrib/backends/test_backends.py +++ b/tests/contrib/backends/test_backends.py @@ -2,9 +2,10 @@ from frontera.core.components import States from frontera.core.models import Request from happybase import Connection -from frontera.contrib.backends.hbase import HBaseState -from frontera.contrib.backends.sqlalchemy import States as SQLAlchemyStates -from frontera.contrib.backends.sqlalchemy.models import StateModel +from frontera.contrib.backends.hbase import HBaseState, HBaseQueue +from frontera.contrib.backends.sqlalchemy import States as SQLAlchemyStates, Queue as SQLAlchemyQueue +from frontera.contrib.backends.sqlalchemy.models import StateModel, QueueModel +from frontera.contrib.backends.memory import MemoryStates, MemoryQueue from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker @@ -18,23 +19,42 @@ r4 = r3.copy() -@pytest.fixture -def hbase_states(): - connection = Connection(host='hbase-docker', port=9090) - states = HBaseState(connection, b'states', cache_size_limit=300000, - write_log_size=5000, drop_all_tables=True) - return states +hbase_connection = None +def get_hbase_connection(): + global hbase_connection + if hbase_connection is None: + hbase_connection = Connection(host='hbase-docker', port=9090) + return hbase_connection -def sqlalchemy_states(): - engine = create_engine('sqlite:///:memory:', echo=False) - session_cls = sessionmaker() - session_cls.configure(bind=engine) - StateModel.__table__.create(bind=engine) - return SQLAlchemyStates(session_cls, StateModel, 100) + +@pytest.fixture(scope="module", params=["memory", "sqlalchemy", "hbase"]) +def states(request): + if request.param == "memory": + ms = MemoryStates(100) + yield ms + return + + if request.param == "sqlalchemy": + engine = create_engine('sqlite:///:memory:', echo=False) + session_cls = sessionmaker() + session_cls.configure(bind=engine) + StateModel.__table__.create(bind=engine) + sqla_states = SQLAlchemyStates(session_cls, StateModel, 100) + yield sqla_states + sqla_states.frontier_stop() + engine.dispose() + return + + if request.param == "hbase": + conn = get_hbase_connection() + states = HBaseState(conn, b'states', cache_size_limit=300000, + write_log_size=5000, drop_all_tables=True) + yield states + states.frontier_stop() + raise KeyError("Unknown backend param") -@pytest.mark.parametrize("states", [hbase_states(), sqlalchemy_states()]) def test_states(states): states.set_states([r1, r2, r3]) assert [r.meta[b'state'] for r 
in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 @@ -55,4 +75,40 @@ def test_states(states): states.set_states([r1, r2, r4]) assert r4.meta[b'state'] == States.QUEUED assert r1.meta[b'state'] == States.CRAWLED - assert r2.meta[b'state'] == States.ERROR \ No newline at end of file + assert r2.meta[b'state'] == States.ERROR + + +@pytest.fixture(scope="module", params=["memory", "sqlalchemy", "hbase"]) +def queue(request): + if request.param == "memory": + mq = MemoryQueue(2) + yield mq + return + + if request.param == "sqlalchemy": + engine = create_engine('sqlite:///:memory:', echo=False) + session_cls = sessionmaker() + session_cls.configure(bind=engine) + QueueModel.__table__.create(bind=engine) + sqla_queue = SQLAlchemyQueue(session_cls, QueueModel, 2) + yield sqla_queue + sqla_queue.frontier_stop() + engine.dispose() + return + + if request.param == "hbase": + conn = get_hbase_connection() + hq = HBaseQueue(conn, 2, b'queue') + yield hq + hq.frontier_stop() + raise KeyError("Unknown backend param") + + +def test_queue(queue): + batch = [('10', 0.5, r1, True), ('11', 0.6, r2, True), + ('12', 0.7, r3, True)] + queue.schedule(batch) + assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1, + max_requests_per_host=10)]) == set([r3.url]) + assert set([r.url for r in queue.get_next_requests(10, 1, min_requests=3, min_hosts=1, + max_requests_per_host=10)]) == set([r1.url, r2.url]) \ No newline at end of file From dffa5e705781c0c7f404e70b391ae2221cb717ec Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 25 Jul 2018 10:16:50 +0500 Subject: [PATCH 230/273] style --- tests/contrib/backends/test_backends.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/contrib/backends/test_backends.py b/tests/contrib/backends/test_backends.py index b6837e8f3..5a59fb8e8 100644 --- a/tests/contrib/backends/test_backends.py +++ b/tests/contrib/backends/test_backends.py @@ -21,6 +21,7 @@ hbase_connection = None + def get_hbase_connection(): global hbase_connection if hbase_connection is None: From 19025cd8789d42f0d7e73986be278272dfbdc999 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 25 Jul 2018 10:22:29 +0500 Subject: [PATCH 231/273] test fix --- tests/contrib/backends/test_backends.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/contrib/backends/test_backends.py b/tests/contrib/backends/test_backends.py index 5a59fb8e8..dfdc3b798 100644 --- a/tests/contrib/backends/test_backends.py +++ b/tests/contrib/backends/test_backends.py @@ -53,6 +53,7 @@ def states(request): write_log_size=5000, drop_all_tables=True) yield states states.frontier_stop() + return raise KeyError("Unknown backend param") @@ -102,6 +103,7 @@ def queue(request): hq = HBaseQueue(conn, 2, b'queue') yield hq hq.frontier_stop() + return raise KeyError("Unknown backend param") From b31cbf1a3c4a227783e1db3e8dcd294f7356c665 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 25 Jul 2018 10:48:54 +0500 Subject: [PATCH 232/273] frontera-scrapy test is enabled --- frontera/utils/add_seeds.py | 24 ++++++++++-------- tests/scrapy_spider/frontera/settings.py | 4 ++- tests/scrapy_spider/spiders/example.py | 1 - tests/test_scrapy_spider.py | 32 +++++++++++++++++++++--- 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/frontera/utils/add_seeds.py b/frontera/utils/add_seeds.py index f884bd000..bbee17b11 100644 --- a/frontera/utils/add_seeds.py +++ b/frontera/utils/add_seeds.py @@ -11,6 +11,19 @@ logger = logging.getLogger(__name__) +def run_add_seeds(settings, 
seeds_file): + fh = open(seeds_file, "rb") + + logger.info("Starting local seeds addition from file %s", seeds_file) + + manager = LocalFrontierManager.from_settings(settings) + manager.add_seeds(fh) + manager.stop() + manager.close() + + logger.info("Seeds addition finished") + + if __name__ == '__main__': parser = ArgumentParser(description="Frontera local add seeds utility") parser.add_argument('--config', type=str, required=True, @@ -28,13 +41,4 @@ logger.setLevel(args.log_level) logger.addHandler(CONSOLE) - fh = open(args.seeds_file, "rb") - - logger.info("Starting local seeds addition from file %s", args.seeds_file) - - manager = LocalFrontierManager.from_settings(settings) - manager.add_seeds(fh) - manager.stop() - manager.close() - - logger.info("Seeds addition finished") \ No newline at end of file + run_add_seeds(settings, args.seeds_file) \ No newline at end of file diff --git a/tests/scrapy_spider/frontera/settings.py b/tests/scrapy_spider/frontera/settings.py index fd2786d9c..4840327f6 100644 --- a/tests/scrapy_spider/frontera/settings.py +++ b/tests/scrapy_spider/frontera/settings.py @@ -1,7 +1,9 @@ #-------------------------------------------------------- # Frontier #-------------------------------------------------------- -BACKEND = 'frontera.contrib.backends.memory.FIFO' +BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' +SQLALCHEMYBACKEND_ENGINE = 'sqlite:///test.db' + MAX_REQUESTS = 5 MAX_NEXT_REQUESTS = 1 diff --git a/tests/scrapy_spider/spiders/example.py b/tests/scrapy_spider/spiders/example.py index 74d5d167e..36bdf2b4a 100644 --- a/tests/scrapy_spider/spiders/example.py +++ b/tests/scrapy_spider/spiders/example.py @@ -5,7 +5,6 @@ class MySpider(CrawlSpider): name = 'example' - start_urls = ['https://en.wikipedia.org/wiki/Main_Page'] callback_calls = 0 rules = [Rule(LinkExtractor(), diff --git a/tests/test_scrapy_spider.py b/tests/test_scrapy_spider.py index e41d09891..3470dc25e 100644 --- a/tests/test_scrapy_spider.py +++ b/tests/test_scrapy_spider.py @@ -3,13 +3,37 @@ from twisted.internet import reactor from scrapy.crawler import Crawler from scrapy import signals -from scrapy.settings import Settings +from scrapy.settings import Settings as ScrapySettings from tests.scrapy_spider.spiders.example import MySpider +from frontera.settings import Settings as FronteraSettings +from frontera.utils import add_seeds import pytest +from os import remove +from os.path import exists -@pytest.mark.skip(reason="no way of currently testing this") -def test_scrapy_spider(): - settings = Settings() + +@pytest.fixture() +def seeds_file(): + fh = open("seeds.txt", "w") + fh.write("https://en.wikipedia.org/wiki/Main_Page") + fh.close() + yield "seeds.txt" + remove("seeds.txt") + + +@pytest.fixture() +def db_file(request): + def rm_file(): + if exists("test.db"): + remove("test.db") + rm_file() + request.addfinalizer(rm_file) + + +def test_scrapy_spider(seeds_file, db_file): + fs = FronteraSettings(module="tests.scrapy_spider.frontera.settings") + add_seeds.run_add_seeds(fs, seeds_file) + settings = ScrapySettings() settings.setmodule("tests.scrapy_spider.settings") crawler = Crawler(MySpider, settings=settings) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) From 95c34f26485b3018c166761616793eb3425cfeac Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 25 Jul 2018 10:59:50 +0500 Subject: [PATCH 233/273] fix warning --- tests/test_strategy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_strategy.py 
b/tests/test_strategy.py index 57140d143..bc8a23430 100644 --- a/tests/test_strategy.py +++ b/tests/test_strategy.py @@ -7,7 +7,7 @@ from frontera.core.components import States -class TestingCrawlingStrategy(BaseCrawlingStrategy): +class DummyCrawlingStrategy(BaseCrawlingStrategy): def read_seeds(self, seeds_file): pass @@ -36,7 +36,7 @@ class TestCrawlingStrategy(object): def strategy(self): settings = Settings() settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' - settings.STRATEGY = 'tests.test_strategy.TestingCrawlingStrategy' + settings.STRATEGY = 'tests.test_strategy.DummyCrawlingStrategy' manager = WorkerFrontierManager.from_settings(settings, db_worker=False, strategy_worker=True) stream = MessageBusStream() states = MemoryStates(10) From dbb51d70e54828860e9df48c70d0113b063fe405 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 25 Jul 2018 11:42:37 +0500 Subject: [PATCH 234/273] skipping scrapy spider for now --- tests/test_scrapy_spider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_scrapy_spider.py b/tests/test_scrapy_spider.py index 3470dc25e..d1e40a24d 100644 --- a/tests/test_scrapy_spider.py +++ b/tests/test_scrapy_spider.py @@ -29,7 +29,7 @@ def rm_file(): rm_file() request.addfinalizer(rm_file) - +@pytest.mark.skip("throws ReactorNotRestartable and requires some planning") def test_scrapy_spider(seeds_file, db_file): fs = FronteraSettings(module="tests.scrapy_spider.frontera.settings") add_seeds.run_add_seeds(fs, seeds_file) From 6ad1a4c3182fe7c66f0abc3649dbac6be69089d5 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 25 Jul 2018 11:47:40 +0500 Subject: [PATCH 235/273] renaming modules to support py3.7 --- .../contrib/messagebus/kafka/{async.py => offsets_fetcher.py} | 0 frontera/contrib/messagebus/kafkabus.py | 2 +- frontera/utils/{async.py => twisted.py} | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename frontera/contrib/messagebus/kafka/{async.py => offsets_fetcher.py} (100%) rename frontera/utils/{async.py => twisted.py} (100%) diff --git a/frontera/contrib/messagebus/kafka/async.py b/frontera/contrib/messagebus/kafka/offsets_fetcher.py similarity index 100% rename from frontera/contrib/messagebus/kafka/async.py rename to frontera/contrib/messagebus/kafka/offsets_fetcher.py diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py index bae555a6d..f4761e052 100644 --- a/frontera/contrib/messagebus/kafkabus.py +++ b/frontera/contrib/messagebus/kafkabus.py @@ -8,7 +8,7 @@ from kafka import KafkaConsumer, KafkaProducer, TopicPartition from frontera.contrib.backends.partitioners import FingerprintPartitioner, Crc32NamePartitioner -from frontera.contrib.messagebus.kafka.async import OffsetsFetcherAsync +from frontera.contrib.messagebus.kafka.offsets_fetcher import OffsetsFetcherAsync from frontera.core.messagebus import BaseMessageBus, BaseSpiderLogStream, BaseSpiderFeedStream, \ BaseStreamConsumer, BaseScoringLogStream, BaseStreamProducer, BaseStatsLogStream from twisted.internet.task import LoopingCall diff --git a/frontera/utils/async.py b/frontera/utils/twisted.py similarity index 100% rename from frontera/utils/async.py rename to frontera/utils/twisted.py From 03a5f912ad3b56902614f3a8749b216bf974fedf Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 25 Jul 2018 11:56:40 +0500 Subject: [PATCH 236/273] to the previous commit --- frontera/utils/{twisted.py => twisted_helpers.py} | 0 frontera/worker/components/__init__.py | 9 +++------ 
frontera/worker/server.py | 2 +- tests/test_utils_async.py | 2 +- 4 files changed, 5 insertions(+), 8 deletions(-) rename frontera/utils/{twisted.py => twisted_helpers.py} (100%) diff --git a/frontera/utils/twisted.py b/frontera/utils/twisted_helpers.py similarity index 100% rename from frontera/utils/twisted.py rename to frontera/utils/twisted_helpers.py diff --git a/frontera/worker/components/__init__.py b/frontera/worker/components/__init__.py index f52db7bf7..74c6a8132 100644 --- a/frontera/worker/components/__init__.py +++ b/frontera/worker/components/__init__.py @@ -2,13 +2,10 @@ from __future__ import absolute_import import time -import logging -import threading - -from twisted.internet import reactor, task, threads -from frontera.exceptions import NotConfigured -from frontera.utils.async import CallLaterOnce +import logging +from frontera.utils.twisted_helpers import CallLaterOnce +from twisted.internet import reactor, threads class DBWorkerBaseComponent(object): diff --git a/frontera/worker/server.py b/frontera/worker/server.py index 74530c4b1..dd05b2e60 100644 --- a/frontera/worker/server.py +++ b/frontera/worker/server.py @@ -7,7 +7,7 @@ from twisted.web import server, resource -from frontera.utils.async import listen_tcp +from frontera.utils.twisted_helpers import listen_tcp logger = getLogger("cf-server") diff --git a/tests/test_utils_async.py b/tests/test_utils_async.py index bbf6d83fe..10c75326d 100644 --- a/tests/test_utils_async.py +++ b/tests/test_utils_async.py @@ -4,7 +4,7 @@ from twisted.test.proto_helpers import MemoryReactor from twisted.internet.protocol import Factory from twisted.internet.task import Clock -from frontera.utils.async import CallLaterOnce, listen_tcp +from frontera.utils.twisted_helpers import CallLaterOnce, listen_tcp class TestCallLaterOnce(object): From f91dd7afc028edad1bedfa3aa890f1217f28ddd4 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 25 Jul 2018 12:13:08 +0500 Subject: [PATCH 237/273] updated with latest changes --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 9c4f8ef68..7a8070d17 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,7 @@ ## Overview -Frontera is a web crawling framework consisting of [crawl frontier](http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html), -and distribution/scaling primitives, allowing to build a large scale online web crawler. +Frontera is a web crawling framework consisting of [crawl frontier](http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html), and distribution/scaling primitives, allowing to build a large scale online web crawler. Frontera takes care of the logic and policies to follow during the crawl. It stores and prioritises links extracted by the crawler to decide which pages to visit next, and capable of doing it in distributed manner. @@ -11,12 +10,13 @@ the crawler to decide which pages to visit next, and capable of doing it in dist ## Main features - Online operation: small requests batches, with parsing done right after fetch. -- Pluggable backend architecture: low-level storage logic is separated from crawling policy. -- Three run modes: single process, distributed spiders, distributed backend and spiders. +- Pluggable backend architecture: low-level backend access logic is separated from crawling strategy. +- Two run modes: single process and distributed. +- Built-in SqlAlchemy, Redis and HBase backends. +- Built-in Apache Kafka and ZeroMQ message buses. 
+- Built-in crawling strategies: breadth-first, depth-first, Discovery (with support of robots.txt and sitemaps). - Transparent data flow, allowing to integrate custom components easily using Kafka. - Message bus abstraction, providing a way to implement your own transport (ZeroMQ and Kafka are available out of the box). -- RDBMS and HBase backends. -- Revisiting logic with RDBMS. - Optional use of Scrapy for fetching and parsing. - 3-clause BSD license, allowing to use in any commercial product. - Python 3 support. From a26a0a91d9d30b50615e111b096689dbbdccbb93 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 25 Jul 2018 12:18:35 +0500 Subject: [PATCH 238/273] version bump --- docs/source/conf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index e720fc4b1..e31061e5b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -48,16 +48,16 @@ # General information about the project. project = u'Frontera' -copyright = u'2014-2016, Frontera authors' +copyright = u'2014-2018, Frontera authors' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '0.7' +version = '0.8' # The full version, including alpha/beta/rc tags. -release = '0.7.1' +release = '0.8.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From dbf15b071eb00869e7fd422e70702c554f20e880 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Wed, 25 Jul 2018 14:59:13 +0500 Subject: [PATCH 239/273] Update README.md battle tested sentence --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7a8070d17..c8df8bb56 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ the crawler to decide which pages to visit next, and capable of doing it in dist - Built-in SqlAlchemy, Redis and HBase backends. - Built-in Apache Kafka and ZeroMQ message buses. - Built-in crawling strategies: breadth-first, depth-first, Discovery (with support of robots.txt and sitemaps). +- Battle tested: our biggest deployment is 60 spiders/strategy workers delivering 50-60M of documents daily for 45 days, without downtime, - Transparent data flow, allowing to integrate custom components easily using Kafka. - Message bus abstraction, providing a way to implement your own transport (ZeroMQ and Kafka are available out of the box). - Optional use of Scrapy for fetching and parsing. 
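The message bus abstraction mentioned in the README above is also the extension point the next patch exercises: a custom transport only has to implement a small producer/consumer contract. Here is a minimal sketch of a custom producer, assuming only the BaseStreamProducer interface used elsewhere in this series (send(key, *messages), flush() and get_offset(partition_id)); the LoggingProducer name and its behaviour are illustrative, not part of Frontera::

    import logging

    from frontera.core.messagebus import BaseStreamProducer

    logger = logging.getLogger(__name__)


    class LoggingProducer(BaseStreamProducer):
        # Drops every message after logging its key and size; a stand-in
        # transport useful for debugging, similar in spirit to the
        # DevNullProducer introduced in the following patch.
        def send(self, key, *messages):
            for msg in messages:
                logger.debug("dropping message, key=%r, size=%d", key, len(msg))

        def flush(self):
            pass  # nothing is buffered, so there is nothing to flush

        def get_offset(self, partition_id):
            return None  # offsets carry no meaning for a discarding transport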
From fb05073d78a8a635dd8c52a27b75f05e33b5d397 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 30 Jul 2018 15:58:18 +0200 Subject: [PATCH 240/273] some refactor + devnull producer for stats --- .../contrib/messagebus/zeromq/__init__.py | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/frontera/contrib/messagebus/zeromq/__init__.py b/frontera/contrib/messagebus/zeromq/__init__.py index f54d64989..5f52295ec 100644 --- a/frontera/contrib/messagebus/zeromq/__init__.py +++ b/frontera/contrib/messagebus/zeromq/__init__.py @@ -8,7 +8,7 @@ import six from frontera.core.messagebus import BaseMessageBus, BaseSpiderLogStream, BaseStreamConsumer, \ - BaseSpiderFeedStream, BaseScoringLogStream, BaseStatsLogStream + BaseSpiderFeedStream, BaseScoringLogStream, BaseStatsLogStream, BaseStreamProducer from frontera.contrib.backends.partitioners import FingerprintPartitioner, Crc32NamePartitioner from frontera.contrib.messagebus.zeromq.socket_config import SocketConfig from six.moves import range @@ -61,7 +61,7 @@ def get_offset(self, partition_id): return self.counter -class Producer(object): +class Producer(BaseStreamProducer): def __init__(self, context, location, identity): self.identity = identity self.sender = context.zeromq.socket(zmq.PUB) @@ -123,9 +123,9 @@ def consumer(self, partition_id, type): return Consumer(self.context, location, partition_id, b'sl') -class UpdateScoreProducer(Producer): - def __init__(self, context, location): - super(UpdateScoreProducer, self).__init__(context, location, b'us') +class NonPartitionedProducer(Producer): + def __init__(self, context, location, identity): + super(NonPartitionedProducer, self).__init__(context, location, identity) def send(self, key, *messages): # Guarantee that msg is actually a list or tuple (should always be true) @@ -155,7 +155,7 @@ def consumer(self): return Consumer(self.context, self.out_location, None, b'us') def producer(self): - return UpdateScoreProducer(self.context, self.in_location) + return NonPartitionedProducer(self.context, self.in_location, b'us') class SpiderFeedProducer(Producer): @@ -194,6 +194,17 @@ def mark_busy(self, partition_id): self.ready_partitions.discard(partition_id) +class DevNullProducer(BaseStreamProducer): + def send(self, key, *messages): + pass + + def flush(self): + pass + + def get_offset(self, partition_id): + pass + + class StatsLogStream(BaseStatsLogStream): def __init__(self, messagebus): self.context = messagebus.context @@ -203,7 +214,7 @@ def consumer(self): pass def producer(self): - return Producer(self.context, self.in_location, b'st') + return DevNullProducer() class Context(object): From 4a8f0c1fe5398aeaed71d1f05e1f40f5248c676a Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 30 Jul 2018 15:59:11 +0200 Subject: [PATCH 241/273] docs + example config update --- .../source/topics/quick-start-distributed.rst | 38 +++++++++++-------- examples/general-spider/README.md | 2 - examples/general-spider/config/__init__.py | 6 +++ examples/general-spider/config/dbw.py | 7 ++++ .../{frontier => config}/single.py | 3 +- .../spider_settings.py => config/spider.py} | 5 +-- examples/general-spider/config/sw.py | 7 ++++ .../workersettings.py => config/worker.py} | 10 ++--- examples/general-spider/frontier/__init__.py | 1 - examples/general-spider/general/settings.py | 5 ++- examples/general-spider/logging.conf | 13 +++---- 11 files changed, 57 insertions(+), 40 deletions(-) delete mode 100644 examples/general-spider/README.md create mode 100644 
examples/general-spider/config/__init__.py
 create mode 100644 examples/general-spider/config/dbw.py
 rename examples/general-spider/{frontier => config}/single.py (90%)
 rename examples/general-spider/{frontier/spider_settings.py => config/spider.py} (89%)
 create mode 100644 examples/general-spider/config/sw.py
 rename examples/general-spider/{frontier/workersettings.py => config/worker.py} (63%)
 delete mode 100644 examples/general-spider/frontier/__init__.py

diff --git a/docs/source/topics/quick-start-distributed.rst b/docs/source/topics/quick-start-distributed.rst
index db37edad1..9b5e0577a 100644
--- a/docs/source/topics/quick-start-distributed.rst
+++ b/docs/source/topics/quick-start-distributed.rst
@@ -4,14 +4,17 @@ Quick start distributed mode
 Here is a guide on how to quickly set up Frontera for single-machine, multiple process, local hacking. We're going to
 deploy the simplest possible setup with SQLite and ZeroMQ. Please proceed to :doc:`cluster-setup` article for a
-production setup details for broad crawlers.
+production setup description.
+
+Our crawler will have the absolute minimum of components needed to work: 1 :term:`spider`, 1 :term:`strategy worker` and
+1 batch-generating and scoring worker.
 
 .. _basic_requirements:
 
 Prerequisites
 =============
 
-Here is what services needs to be installed and configured before running Frontera:
+Here is what needs to be installed and configured before running Frontera:
 
 - Python 2.7+ or 3.4+
 - Scrapy
@@ -41,6 +44,9 @@ settings files, please consult :doc:`settings reference ` to
 Start cluster
 =============
 
+    IMPORTANT! Because we're using ZeroMQ and the queue is stored in memory, the order in which the components are
+    started is important, please follow the sequence described below.
+
 First, let's start ZeroMQ broker. ::
 
     $ python -m frontera.contrib.messagebus.zeromq.broker
@@ -49,25 +55,25 @@ You should see a log output of broker with statistics on messages transmitted.
 
 All further commands have to be made from ``general-spider`` root directory.
 
-Second, let's start DB worker. ::
+Second, there are Spanish (.es zone) internet URLs from DMOZ directory in general spider repository, let's use them as
+seeds to bootstrap crawling::
+
+    $ python -m frontera.utils.add_seeds --config config.dbw --seeds-file seeds_es_smp.txt
 
-    $ python -m frontera.worker.db --config frontier.workersettings
+You should notice the log output and a message saying that seeds addition is finished.
 
+Third, starting the :term:`strategy worker`::
 
-You should notice that DB is writing messages to the output. It's ok if nothing is written in ZeroMQ sockets, because
-of absence of seed URLs in the system.
+    $ python -m frontera.worker.strategy --config config.sw
 
-There are Spanish (.es zone) internet URLs from DMOZ directory in general spider repository, let's use them as
-seeds to bootstrap crawling.
-Starting the spiders: ::
+Fourth, starting the Scrapy spider::
 
-    $ scrapy crawl general -L INFO -s FRONTERA_SETTINGS=frontier.spider_settings -s SEEDS_SOURCE=seeds_es_smp.txt -s SPIDER_PARTITION_ID=0
-    $ scrapy crawl general -L INFO -s FRONTERA_SETTINGS=frontier.spider_settings -s SPIDER_PARTITION_ID=1
+    $ python -m scrapy crawl general
 
+Finally, the DB worker::
 
-You should end up with 2 spider processes running. Each should read it's own Frontera config, and first one is using
-``SEEDS_SOURCE`` option to read seeds to bootstrap Frontera cluster.
+    $ python -m frontera.worker.db --no-incoming --config config.dbw --partitions 0
 
-After some time seeds will pass the streams and will be scheduled for downloading by workers.
At this moment crawler -is bootstrapped. Now you can periodically check DB worker output or ``metadata`` table contents to see that there is -actual activity. +You should notice in logs that DB worker is trying to generate batches and after a short period the Scrapy is crawling +pages, also check the stats change in ZMQ broker and strategy worker. That's it, crawler is running with default +:term:`crawling strategy`. diff --git a/examples/general-spider/README.md b/examples/general-spider/README.md deleted file mode 100644 index bb1cb17d9..000000000 --- a/examples/general-spider/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# general-spider -A general spider with links extraction for Frontera diff --git a/examples/general-spider/config/__init__.py b/examples/general-spider/config/__init__.py new file mode 100644 index 000000000..01aa296ae --- /dev/null +++ b/examples/general-spider/config/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +SPIDER_FEED_PARTITIONS = 1 +SPIDER_LOG_PARTITIONS = 1 +LOCAL_MODE=False \ No newline at end of file diff --git a/examples/general-spider/config/dbw.py b/examples/general-spider/config/dbw.py new file mode 100644 index 000000000..4f48a6fe4 --- /dev/null +++ b/examples/general-spider/config/dbw.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import +from config.worker import * + +SQLALCHEMYBACKEND_ENGINE = 'sqlite:///queue.sqlite' + + + diff --git a/examples/general-spider/frontier/single.py b/examples/general-spider/config/single.py similarity index 90% rename from examples/general-spider/frontier/single.py rename to examples/general-spider/config/single.py index f86c135bd..c7208a706 100644 --- a/examples/general-spider/frontier/single.py +++ b/examples/general-spider/config/single.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- -import logging +from __future__ import absolute_import + BACKEND = 'frontera.contrib.backends.sqlalchemy.revisiting.Backend' SQLALCHEMYBACKEND_ENGINE = 'sqlite:///url_storage.sqlite' diff --git a/examples/general-spider/frontier/spider_settings.py b/examples/general-spider/config/spider.py similarity index 89% rename from examples/general-spider/frontier/spider_settings.py rename to examples/general-spider/config/spider.py index 2e35ff6cb..8f6d0a816 100644 --- a/examples/general-spider/frontier/spider_settings.py +++ b/examples/general-spider/config/spider.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from frontera.settings.default_settings import MIDDLEWARES +from config import * MAX_NEXT_REQUESTS = 256 DELAY_ON_EMPTY = 5.0 @@ -13,6 +15,3 @@ # Crawl frontier backend #-------------------------------------------------------- BACKEND = 'frontera.contrib.backends.remote.messagebus.MessageBusBackend' -SPIDER_FEED_PARTITIONS = 2 - - diff --git a/examples/general-spider/config/sw.py b/examples/general-spider/config/sw.py new file mode 100644 index 000000000..06afb1808 --- /dev/null +++ b/examples/general-spider/config/sw.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import +from config.worker import * + +SQLALCHEMYBACKEND_ENGINE = 'sqlite:///strategy.sqlite' + + + diff --git a/examples/general-spider/frontier/workersettings.py b/examples/general-spider/config/worker.py similarity index 63% rename from examples/general-spider/frontier/workersettings.py rename to examples/general-spider/config/worker.py index fa0e59adf..fe850d316 100644 --- a/examples/general-spider/frontier/workersettings.py +++ b/examples/general-spider/config/worker.py @@ -1,22 +1,18 @@ # 
-*- coding: utf-8 -*- +from __future__ import absolute_import from frontera.settings.default_settings import MIDDLEWARES +from config import * MAX_NEXT_REQUESTS = 512 -SPIDER_FEED_PARTITIONS = 2 -SPIDER_LOG_PARTITIONS = 1 #-------------------------------------------------------- # Url storage #-------------------------------------------------------- -BACKEND = 'frontera.contrib.backends.sqlalchemy.SQLAlchemyBackend' -#BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' +BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' -SQLALCHEMYBACKEND_ENGINE = 'sqlite:///url_storage_dist.sqlite' SQLALCHEMYBACKEND_ENGINE_ECHO = False -SQLALCHEMYBACKEND_DROP_ALL_TABLES = True -SQLALCHEMYBACKEND_CLEAR_CONTENT = True from datetime import timedelta SQLALCHEMYBACKEND_REVISIT_INTERVAL = timedelta(days=3) diff --git a/examples/general-spider/frontier/__init__.py b/examples/general-spider/frontier/__init__.py deleted file mode 100644 index 7c68785e9..000000000 --- a/examples/general-spider/frontier/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/examples/general-spider/general/settings.py b/examples/general-spider/general/settings.py index e29b425e7..54d192353 100644 --- a/examples/general-spider/general/settings.py +++ b/examples/general-spider/general/settings.py @@ -5,10 +5,9 @@ NEWSPIDER_MODULE = 'general.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'topic (+http://www.yourdomain.com)' +USER_AGENT = 'Frontera-based example bot (+https://github.com/scrapinghub/frontera)' SPIDER_MIDDLEWARES = { - 'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1, 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000, 'scrapy.spidermiddleware.depth.DepthMiddleware': None, 'scrapy.spidermiddleware.offsite.OffsiteMiddleware': None, @@ -46,3 +45,5 @@ REACTOR_THREADPOOL_MAXSIZE = 32 DNS_TIMEOUT = 180 +FRONTERA_SETTINGS = 'config.spider' +HTTPERROR_ALLOW_ALL = True \ No newline at end of file diff --git a/examples/general-spider/logging.conf b/examples/general-spider/logging.conf index 9fd40f7eb..cab082e0c 100644 --- a/examples/general-spider/logging.conf +++ b/examples/general-spider/logging.conf @@ -5,20 +5,20 @@ keys=root,dbw,sw keys=hand01 [formatters] -keys=form01,form02 +keys=form01 [logger_root] -level=DEBUG +level=INFO handlers=hand01 [logger_dbw] -level=DEBUG +level=INFO handlers=hand01 qualname=db-worker propagate=0 [logger_sw] -level=DEBUG +level=INFO handlers=hand01 qualname=strategy-worker propagate=0 @@ -27,11 +27,8 @@ propagate=0 class=StreamHandler level=NOTSET args=(sys.stdout,) -formatter=form02 +formatter=form01 [formatter_form01] format=%(asctime)s %(levelname)-8s %(name)-15s %(message)s -[formatter_form02] -format=%(log_color)s %(asctime)s %(levelname)-8s %(name)-15s %(message)s -class=colorlog.ColoredFormatter From 51526c1990ca9c974f442c422826396e66922f5f Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 30 Jul 2018 16:37:14 +0200 Subject: [PATCH 242/273] local_mode option --- docs/source/topics/cluster-setup.rst | 26 +++++++++------- .../topics/custom_crawling_strategy.rst | 30 ++++++++++--------- docs/source/topics/quick-start-single.rst | 2 -- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/docs/source/topics/cluster-setup.rst b/docs/source/topics/cluster-setup.rst index f5c84a15f..d1bacb021 100644 --- a/docs/source/topics/cluster-setup.rst +++ b/docs/source/topics/cluster-setup.rst @@ -43,7 +43,7 @@ 
Configuring HBase Configuring Frontera ==================== -Every Frontera component requires it's own configuration module, but some options are shared, so we recommend to create +Every Frontera component requires its own configuration module, but some options are shared, so we recommend to create a common modules and import settings from it in component's modules. 1. Create a common module and add there: :: @@ -101,6 +101,7 @@ The logging can be configured according to https://docs.python.org/2/library/log BACKEND = 'frontera.contrib.backends.remote.messagebus.MessageBusBackend' KAFKA_GET_TIMEOUT = 0.5 + LOCAL_MODE = False # by default Frontera is prepared for single process mode 6. Configure Scrapy settings module. It's located in Scrapy project folder and referenced in scrapy.cfg. Let's add @@ -125,9 +126,11 @@ Starting the cluster First, let's start storage worker: :: # start DB worker only for batch generation - $ python -m frontera.worker.db --config [db worker config module] --no-incoming - ... - # Then start next one dedicated to spider log processing + # use single instance for every 10 partitions + $ python -m frontera.worker.db --config [db worker config module] --no-incoming --partitions 0,1 + + + # Optionally, start next one dedicated to spider log processing. $ python -m frontera.worker.db --no-batches --config [db worker config module] @@ -141,17 +144,18 @@ Next, let's start strategy workers, one process per spider log partition: :: You should notice that all processes are writing messages to the log. It's ok if nothing is written in streams, because of absence of seed URLs in the system. -Let's put our seeds in text file, one URL per line and start spiders. A single spider per spider feed partition: :: +Let's put our seeds in text file, one URL per line and run:: + + $ python -m frontera.utils.add_seeds --config [your_frontera_config] --seeds-file [path to your seeds file] + +Finally, a single spider per spider feed partition: :: - $ scrapy crawl [spider] -L INFO -s SEEDS_SOURCE = 'seeds.txt' -s SPIDER_PARTITION_ID=0 - ... $ scrapy crawl [spider] -L INFO -s SPIDER_PARTITION_ID=1 $ scrapy crawl [spider] -L INFO -s SPIDER_PARTITION_ID=2 ... $ scrapy crawl [spider] -L INFO -s SPIDER_PARTITION_ID=N -You should end up with N spider processes running. Usually it's enough for a single instance to read seeds from -``SEEDS_SOURCE`` variable to pass seeds to Frontera cluster. Seeds are only read if spider queue is empty. -::setting:`SPIDER_PARTITION_ID` can be read from config file also. +You should end up with N spider processes running. Also :setting:`SPIDER_PARTITION_ID` can be read from config file. -After some time seeds will pass the streams and will be scheduled for downloading by workers. Crawler is bootstrapped. +You're done, crawler should start crawling. Any component can be restarted any time, without major data loss. However, +for pausing its enough to stop batch gen only. \ No newline at end of file diff --git a/docs/source/topics/custom_crawling_strategy.rst b/docs/source/topics/custom_crawling_strategy.rst index aa5038a41..949925065 100644 --- a/docs/source/topics/custom_crawling_strategy.rst +++ b/docs/source/topics/custom_crawling_strategy.rst @@ -44,7 +44,7 @@ It has to be inherited from BaseCrawlingStrategy and implement it's API. .. automethod:: frontera.strategy.BaseCrawlingStrategy.page_crawled .. automethod:: frontera.strategy.BaseCrawlingStrategy.filter_extracted_links .. automethod:: frontera.strategy.BaseCrawlingStrategy.links_extracted - .. 
automethod:: frontera.strategy.BaseCrawlingStrategy.page_error + .. automethod:: frontera.strategy.BaseCrawlingStrategy.request_error .. automethod:: frontera.strategy.BaseCrawlingStrategy.finished .. automethod:: frontera.strategy.BaseCrawlingStrategy.close .. automethod:: frontera.strategy.BaseCrawlingStrategy.schedule @@ -90,12 +90,13 @@ Main This is the main cycle used when crawl is in progress. In a nutshell on every spider event the specific handler is called, depending on the type of event. When strategy worker is getting the SIGTERM signal it's trying to stop politely - by calling close(). In its normal state it listens for a spider log and executes the event handlers. +by calling close(). In its normal state it listens for a spider log and executes the event handlers. -1. from_worker() → init() -1. page_crawled(response) OR page_error(request, error) OR filter_extracted_links(request, links) and subsequent links_extracted(request, links) -1. close() -1. exit +1. `from_worker()` → init() +2. `page_crawled(response)` OR `page_error(request, error)` OR `filter_extracted_links(request, links)` and subsequent + `links_extracted(request, links)` +3. `close()` +4. exit Scheduling and creating requests ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -103,7 +104,7 @@ Scheduling and creating requests The ultimate goal of crawling strategy is scheduling of requests. To schedule request there is a method schedule(request, score). The request is an instance of :class:`Request ` class and is often available from arguments of event handlers: _page_crawled_, _page_error_ and _links_extracted_, or can be created - on-demand using _create_request()_ method. +on-demand using :attr:`create_request ` method. IMPORTANT NOTICE @@ -181,13 +182,14 @@ log output of link states changes (experimental SW feature). Meta fields ----------- -== ======= =========== ========= -# name description presence -== ======= =========== ========= +== ============== =================================================================================================================================================== ========= +# name description presence +== ============== =================================================================================================================================================== ========= 1 b"slot" Queue partitioning key in bytes, highest priority. Use it if your app requires partitioning other than default 2-nd level domain-based partitioning Optional -2 b"domain" Dict generated by Frontera DomainMiddleware, and containing parsed domain name Always -3 b"state" Integer representing the link state, set by strategy worker. Link states are defined in frontera.core.components.States Always -4 b"encoding" In response, for HTML, encoding detected by Scrapy Optional -5 b"scrapy_meta" When scheduling can be used to set meta field for Scrapy Optional +2 b"domain" Dict generated by Frontera DomainMiddleware, and containing parsed domain name Always +3 b"state" Integer representing the link state, set by strategy worker. Link states are defined in frontera.core.components.States Always +4 b"encoding" In response, for HTML, encoding detected by Scrapy Optional +5 b"scrapy_meta" When scheduling can be used to set meta field for Scrapy Optional +== ============== =================================================================================================================================================== ========= Keys and string types in nested structures are always bytes. 
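To make the handler flow documented above concrete, here is a minimal breadth-first style strategy sketch. It assumes only the BaseCrawlingStrategy API listed in these docs (schedule(), create_request(), the States constants and the b'state' meta field); the scores and the http prefix filter are illustrative choices, not requirements::

    from frontera.core.components import States
    from frontera.strategy import BaseCrawlingStrategy


    class ExampleCrawlingStrategy(BaseCrawlingStrategy):
        def read_seeds(self, stream):
            # the seeds file is opened in binary mode, one URL per line
            for line in stream:
                url = line.strip().decode('utf-8')
                if url:
                    self.schedule(self.create_request(url), score=1.0)

        def filter_extracted_links(self, request, links):
            # cheap pre-filter; robots.txt or sitemap handling would go here
            return [link for link in links if link.url.startswith('http')]

        def links_extracted(self, request, links):
            for link in links:
                if link.meta[b'state'] == States.NOT_CRAWLED:
                    self.schedule(link, score=0.5)
                    link.meta[b'state'] = States.QUEUED

        def page_crawled(self, response):
            response.meta[b'state'] = States.CRAWLED

        def request_error(self, request, error):
            request.meta[b'state'] = States.ERROR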
diff --git a/docs/source/topics/quick-start-single.rst b/docs/source/topics/quick-start-single.rst index f72f3dd95..7a2f58b33 100644 --- a/docs/source/topics/quick-start-single.rst +++ b/docs/source/topics/quick-start-single.rst @@ -58,8 +58,6 @@ distributed mode:: SPIDER_FEED_PARTITIONS = 1 SPIDER_LOG_PARTITIONS = 1 - STRATEGY = "frontera.strategy.basic.BasicCrawlingStrategy" - 5. Choose your backend ====================== From 7b6717dfa7aead27485e4dd38c8836c652f92d9a Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 30 Jul 2018 16:47:18 +0200 Subject: [PATCH 243/273] version bump --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index e31061e5b..9f4305251 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -57,7 +57,7 @@ # The short X.Y version. version = '0.8' # The full version, including alpha/beta/rc tags. -release = '0.8.0' +release = '0.8.0.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From fff3617662e2efc93ded0272893ea58277a35542 Mon Sep 17 00:00:00 2001 From: Ignacio Capurro Date: Tue, 21 Aug 2018 00:08:27 -0300 Subject: [PATCH 244/273] Remove invalid keyword argument 'sleep' from kafka pull --- frontera/contrib/messagebus/kafka/offsets_fetcher.py | 2 +- .../contrib/messagebus/kafka/test_offset_fetcher.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 tests/contrib/messagebus/kafka/test_offset_fetcher.py diff --git a/frontera/contrib/messagebus/kafka/offsets_fetcher.py b/frontera/contrib/messagebus/kafka/offsets_fetcher.py index 97315341a..15630e290 100644 --- a/frontera/contrib/messagebus/kafka/offsets_fetcher.py +++ b/frontera/contrib/messagebus/kafka/offsets_fetcher.py @@ -178,7 +178,7 @@ def offsets(self, partitions, timestamp): if future.exception.invalid_metadata: refresh_future = self._client.cluster.request_update() - self._client.poll(future=refresh_future, sleep=True) + self._client.poll(future=refresh_future) log.warning("Got exception %s and kept the loop", future.exception) if offsets: return offsets diff --git a/tests/contrib/messagebus/kafka/test_offset_fetcher.py b/tests/contrib/messagebus/kafka/test_offset_fetcher.py new file mode 100644 index 000000000..c578d012a --- /dev/null +++ b/tests/contrib/messagebus/kafka/test_offset_fetcher.py @@ -0,0 +1,12 @@ +from mock import Mock, patch +from frontera.contrib.messagebus.kafka.offsets_fetcher import OffsetsFetcherAsync + + +class TestOffsetsFetcherAsync(object): + + def test_offsets_invalid_metadata(self): + fetcher = OffsetsFetcherAsync(group_id='test', topic='test') + future = Mock(succeeded=lambda: False, retriable=lambda: True, exceptions=Mock(invalid_metadata=True)) + with patch.object(fetcher, '_send_offset_request', return_value=[future]) as _send_offset_request: + fetcher.offsets([0], -1) + assert _send_offset_request.was_called() From 4fb16c9195d9d9431dbdbac77fc42131fb1a2f9c Mon Sep 17 00:00:00 2001 From: Ignacio Capurro Date: Tue, 28 Aug 2018 12:29:10 -0300 Subject: [PATCH 245/273] Removed offset unit test --- .../contrib/messagebus/kafka/test_offset_fetcher.py | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 tests/contrib/messagebus/kafka/test_offset_fetcher.py diff --git a/tests/contrib/messagebus/kafka/test_offset_fetcher.py b/tests/contrib/messagebus/kafka/test_offset_fetcher.py deleted file mode 100644 index c578d012a..000000000 --- 
a/tests/contrib/messagebus/kafka/test_offset_fetcher.py +++ /dev/null @@ -1,12 +0,0 @@ -from mock import Mock, patch -from frontera.contrib.messagebus.kafka.offsets_fetcher import OffsetsFetcherAsync - - -class TestOffsetsFetcherAsync(object): - - def test_offsets_invalid_metadata(self): - fetcher = OffsetsFetcherAsync(group_id='test', topic='test') - future = Mock(succeeded=lambda: False, retriable=lambda: True, exceptions=Mock(invalid_metadata=True)) - with patch.object(fetcher, '_send_offset_request', return_value=[future]) as _send_offset_request: - fetcher.offsets([0], -1) - assert _send_offset_request.was_called() From bd3211bd2b0cbb7074369f03ed01e7efbfa33684 Mon Sep 17 00:00:00 2001 From: Victor Costa Date: Tue, 11 Sep 2018 10:32:35 +0100 Subject: [PATCH 246/273] Fix the redis backend The method flush in RedisState has to define a default value for force_clear to avoid problems with the frontier manager. --- frontera/contrib/backends/redis_backend/__init__.py | 2 +- frontera/settings/default_settings.py | 1 + tests/contrib/backends/redis_backend/test_redis.py | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index dcadbdb31..31869a161 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -258,7 +258,7 @@ def get(obj): [get(obj) for obj in objs] - def flush(self, force_clear): + def flush(self, force_clear=False): if len(self._cache) > self._cache_size_limit: force_clear = True [self._redis_pipeline.hmset(fprint, {FIELD_STATE: state}) for (fprint, state) in self._cache.items()] diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index fdf5d2aad..1685ace2b 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -49,6 +49,7 @@ REDIS_BACKEND_CODEC = 'frontera.contrib.backends.remote.codecs.msgpack' REDIS_HOST = 'localhost' REDIS_PORT = 6379 +REDIS_STATE_CACHE_SIZE_LIMIT = 0 REQUEST_MODEL = 'frontera.core.models.Request' RESPONSE_MODEL = 'frontera.core.models.Response' diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py index 18fb897af..c924bd495 100644 --- a/tests/contrib/backends/redis_backend/test_redis.py +++ b/tests/contrib/backends/redis_backend/test_redis.py @@ -501,6 +501,13 @@ def test_get_next_request_has_requests(self): requests = subject.get_next_requests(max_next_requests=10, partitions=['0', '1']) self.assertEqual(3, len(requests)) + def test_close_manager(self): + settings = Settings(module='frontera.settings.default_settings') + settings.set('BACKEND', 'frontera.contrib.backends.redis_backend.RedisBackend') + manager = WorkerFrontierManager.from_settings(settings, strategy_worker=True) + self.assertEqual(RedisBackend, manager.backend.__class__) + manager.close() + if __name__ == '__main__': main() From 0c8c58b6c54638a42dd24d6949ab2b27fb3a5afb Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 2 Nov 2018 17:00:43 +0100 Subject: [PATCH 247/273] scrapy role, overview update --- docs/source/topics/overview.rst | 29 +-------- docs/source/topics/scrapy-integration.rst | 76 +++++++++++++++++++---- 2 files changed, 68 insertions(+), 37 deletions(-) diff --git a/docs/source/topics/overview.rst b/docs/source/topics/overview.rst index 619f3964a..1db9ea99e 100644 --- a/docs/source/topics/overview.rst +++ 
b/docs/source/topics/overview.rst
@@ -28,7 +28,7 @@ Here are few cases, external crawl frontier can be suitable for:
 
 * URL ordering/queueing isolation from the spider (e.g. distributed cluster of spiders, need of remote management of
   ordering/queueing),
-* URL (meta)data storage is needed (e.g. to demonstrate it's contents somewhere),
+* URL (meta)data storage is needed (e.g. to be able to pause and resume the crawl),
 * advanced URL ordering logic is needed, when it's hard to maintain code within spider/fetcher.
 
 
@@ -48,31 +48,8 @@ If website is big, and it's expensive to crawl the whole website, Frontera can b
 the most important documents.
 
 
-Distributed load, few websites
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-If website needs to be crawled faster than single spider one could use distributed spiders mode. In this mode Frontera
-is distributing spider processes and using one instance of backend worker. Requests are distributed using
-:term:`message bus` of your choice and distribution logic can be adjusted using custom partitioning. By default requests
-are distributed to spiders randomly, and desired request rate can be set in spiders.
-
-Consider also using proxy services, such as `Crawlera`_.
-
-
-Revisiting
-^^^^^^^^^^
-
-There is a set of websites and one need to re-crawl them on timely (or other) manner. Frontera provides simple
-revisiting backend, scheduling already visited documents for next visit using time interval set by option. This
-backend is using general relational database for persistence and can be used in single process or distributed
-spiders modes.
-
-Watchdog use case - when one needs to be notified about document changes, also could be addressed with such a backend
-and minor customization.
-
-
-Broad crawling
-^^^^^^^^^^^^^^
+Broad crawling of many websites
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 This use case requires full distribution: spiders and backend. In addition to spiders process one should be running
 :term:`strategy worker` (s) and :term:`db worker` (s), depending on chosen partitioning scheme.
diff --git a/docs/source/topics/scrapy-integration.rst b/docs/source/topics/scrapy-integration.rst
index ff3a8bee6..7069667e7 100644
--- a/docs/source/topics/scrapy-integration.rst
+++ b/docs/source/topics/scrapy-integration.rst
@@ -6,6 +6,20 @@ To use Frontera with Scrapy, you will need to add `Scrapy middlewares`_ and rede
 custom Frontera scheduler. Both can be done by modifying `Scrapy settings`_.
 
 
+The purpose
+===========
+
+Scrapy is expected to be used as a fetching, HTML parsing and links extracting component. Your spider code has
+to produce responses and requests from extracted links. That's all. Frontera's business is to keep the links, queue
+them and schedule links when needed.
+
+Please make sure all the middlewares affecting the crawling, like DepthMiddleware, OffsiteMiddleware or
+RobotsTxtMiddleware, are disabled.
+
+All other use cases, where Scrapy is busy with item generation, scraping from HTML, or scheduling links to bypass
+Frontera, are doomed to cause countless hours of maintenance. Please don't use Frontera integrated with Scrapy that way.
+
+
 Activating the frontier
 =======================
 
@@ -98,25 +112,65 @@ Writing Scrapy spider
 Spider logic
 ------------
 
-Creation of basic Scrapy spider is described at `Quick start single process`_ page.
-It's also a good practice to prevent spider from closing because of insufficiency of queued requests transport:::
+Creation of new Scrapy project is described at `Quick start single process`_ page.
Again, your spider code has
+to produce responses and requests from extracted links. Also, make sure exceptions caused by request processing are
+not intercepted by any of the middlewares. Otherwise error delivery to the :term:`crawling strategy` will be broken.
+
+Here is some example code to start with::

-    @classmethod
-    def from_crawler(cls, crawler, *args, **kwargs):
-        spider = cls(*args, **kwargs)
-        spider._set_crawler(crawler)
-        spider.crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
-        return spider
+    from scrapy import Spider
+    from scrapy.linkextractors import LinkExtractor
+    from scrapy.http import Request
+    from scrapy.http.response.html import HtmlResponse
+
+    class CommonPageSpider(Spider):
+
+        name = "commonpage"
+
+        def __init__(self, *args, **kwargs):
+            super(CommonPageSpider, self).__init__(*args, **kwargs)
+            self.le = LinkExtractor()
+
+        def parse(self, response):
+            if not isinstance(response, HtmlResponse):
+                return
+            for link in self.le.extract_links(response):
+                r = Request(url=link.url)
+                r.meta.update(link_text=link.text)
+                yield r
 
 Configuration guidelines
 ------------------------
 
+Please specify a correct user agent string to disclose yourself to webmasters::
+
+    USER_AGENT = 'Some-Bot (+http://url/to-the-page-describing-the-purpose-of-crawling)'
+
+
+When using Frontera, robots.txt obeying has to be implemented in the :term:`crawling strategy`::
+
+    ROBOTSTXT_OBEY = False
+
+Disable some of the spider and downloader middlewares which may affect the crawling::
+
+    SPIDER_MIDDLEWARES.update({
+        'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
+        'scrapy.spidermiddlewares.referer.RefererMiddleware': None,
+        'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None,
+        'scrapy.spidermiddlewares.depth.DepthMiddleware': None,
+        'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': None
+    })
+
+    DOWNLOADER_MIDDLEWARES.update({
+        'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': None,
+    })
+
+    del DOWNLOADER_MIDDLEWARES_BASE['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware']
+
+
 There are several tunings you can make for efficient broad crawling.
 
 Various settings suitable for broad crawling::

From 037c3b4db1dad2f9e84216e8d4ad5f39701d5290 Mon Sep 17 00:00:00 2001
From: Guillermo Aguirre
Date: Wed, 14 Nov 2018 14:36:10 -0300
Subject: [PATCH 248/273] Update cluster-setup docs

---
 docs/source/topics/cluster-setup.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/topics/cluster-setup.rst b/docs/source/topics/cluster-setup.rst
index d1bacb021..281c95690 100644
--- a/docs/source/topics/cluster-setup.rst
+++ b/docs/source/topics/cluster-setup.rst
@@ -88,7 +88,7 @@
+
+
 There several tunings you can make for efficient broad crawling.
 
 Various settings suitable for broad crawling::
 
From 037c3b4db1dad2f9e84216e8d4ad5f39701d5290 Mon Sep 17 00:00:00 2001
From: Guillermo Aguirre
Date: Wed, 14 Nov 2018 14:36:10 -0300
Subject: [PATCH 248/273] Update cluster-setup docs

---
 docs/source/topics/cluster-setup.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/topics/cluster-setup.rst b/docs/source/topics/cluster-setup.rst
index d1bacb021..281c95690 100644
--- a/docs/source/topics/cluster-setup.rst
+++ b/docs/source/topics/cluster-setup.rst
@@ -88,7 +88,7 @@ a common modules and import settings from it in component's modules.
     from __future__ import absolute_import
     from .worker import *
 
-    CRAWLING_STRATEGY = ''  # path to the crawling strategy class
+    STRATEGY = ''  # path to the crawling strategy class
     LOGGING_CONFIG='logging-sw.conf' # if needed
 
 The logging can be configured according to https://docs.python.org/2/library/logging.config.html see the
@@ -127,7 +127,7 @@ First, let's start storage worker: ::
 
     # start DB worker only for batch generation
     # use single instance for every 10 partitions
-    $ python -m frontera.worker.db --config [db worker config module] --no-incoming --partitions 0,1
+    $ python -m frontera.worker.db --config [db worker config module] --no-incoming --partitions 0 1
 
 
     # Optionally, start next one dedicated to spider log processing.
@@ -158,4 +158,4 @@ Finally, a single spider per spider feed partition: ::
 You should end up with N spider processes running. Also :setting:`SPIDER_PARTITION_ID` can be read from config file.
 
 You're done, crawler should start crawling. Any component can be restarted any time, without major data loss. However,
-for pausing its enough to stop batch gen only.
\ No newline at end of file
+for pausing its enough to stop batch gen only.

From f5ef1091e4952e938042821f34b60c0f0b0267f1 Mon Sep 17 00:00:00 2001
From: IAlwaysBeCoding
Date: Wed, 28 Nov 2018 23:03:45 -0800
Subject: [PATCH 249/273] Docs: BaseCrawlingStrategy._refresh_states() also takes a non-iterable for the requests parameter.

It is important to disclose that `_refresh_states()` also takes a non-iterable for the requests
parameter. Eventually, `refresh_and_keep()` wraps a non-iterable requests argument into a
single-item list.
---
 frontera/strategy/__init__.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/frontera/strategy/__init__.py b/frontera/strategy/__init__.py
index 8fe6f223c..1248b2822 100644
--- a/frontera/strategy/__init__.py
+++ b/frontera/strategy/__init__.py
@@ -138,9 +138,10 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No
 
     def refresh_states(self, requests):
         """
-        Retrieves states for all requests from storage.
-
-        :param requests: list(:class:`Request `)
+        Retrieves states for all requests from storage. Converts requests to a 1 item list() of :class:`Request `
+        if requests is not a list of :class:`Request ` objects.
+
+        :param requests: list(:class:`Request `) or a single :class:`Request `
         """
         self._states_context.refresh_and_keep(requests)
 
@@ -148,4 +149,4 @@ def frontier_start(self):
     pass
 
     def frontier_stop(self):
-        pass
\ No newline at end of file
+        pass

From 0c81482557e1f8078cd74de67778126ba2ec1713 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov
Date: Fri, 30 Nov 2018 12:25:22 -0800
Subject: [PATCH 250/273] Update frontera/strategy/__init__.py

Okay, that makes sense.

Co-Authored-By: IAlwaysBeCoding
---
 frontera/strategy/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontera/strategy/__init__.py b/frontera/strategy/__init__.py
index 1248b2822..2c9c793a2 100644
--- a/frontera/strategy/__init__.py
+++ b/frontera/strategy/__init__.py
@@ -138,7 +138,7 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No
 
     def refresh_states(self, requests):
         """
-        Retrieves states for all requests from storage. Converts requests to a 1 item list() of :class:`Request `
+        Retrieves states for all requests from storage. `
         if requests is not a list of :class:`Request ` objects.
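+        For example, both of the following calls are supported (``request`` here stands for any
+        :class:`Request ` instance; a single request is wrapped into a one-item list internally)::
+
+            self.refresh_states(request)
+            self.refresh_states([request1, request2])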
:param requests: list(:class:`Request `) or a single :class:`Request ` From dfdc396fbd3c73a0bbf7ebc169bf21ab3ce3e3c2 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Mon, 14 Jan 2019 11:58:38 +0100 Subject: [PATCH 251/273] removing call to absent API in happybase --- frontera/contrib/backends/hbase/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/frontera/contrib/backends/hbase/__init__.py b/frontera/contrib/backends/hbase/__init__.py index 0c2a08e05..13df3743a 100644 --- a/frontera/contrib/backends/hbase/__init__.py +++ b/frontera/contrib/backends/hbase/__init__.py @@ -599,13 +599,7 @@ def get_next_requests(self, max_next_requests, **kwargs): return results def get_stats(self): - """Helper to get stats dictionary for the backend. - - For now it provides only HBase client stats. - """ stats = {} - with time_elapsed('Call HBase backend get_stats()'): - stats.update(self.connection.client.get_stats()) if self._states: stats.update(self._states.get_stats()) return stats From 424f569eedf5b487fb73a749825ef404dc2a9745 Mon Sep 17 00:00:00 2001 From: "Knut O. Hellan" Date: Mon, 21 Jan 2019 10:45:23 +0100 Subject: [PATCH 252/273] Update after redis update Use dict of scheduled items rather than a list after redis library update. --- frontera/contrib/backends/redis_backend/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/frontera/contrib/backends/redis_backend/__init__.py b/frontera/contrib/backends/redis_backend/__init__.py index 31869a161..2f4d37933 100644 --- a/frontera/contrib/backends/redis_backend/__init__.py +++ b/frontera/contrib/backends/redis_backend/__init__.py @@ -218,10 +218,10 @@ def _schedule(self, batch, timestamp): raise TypeError("domain of unknown type.") item = (timestamp, fingerprint, host_crc32, self._encoder.encode_request(request), score) interval_start = self.get_interval_start(score) - data.setdefault(partition_id, []).extend([int(interval_start * 100), packb(item)]) - for (key, items) in data.items(): - self._redis_pipeline.zadd(key, *items), data.items() - self._redis_pipeline.execute() + data.setdefault(partition_id, {})[packb(item)] = int(interval_start * 100) + for (key, items) in data.items(): + self._redis_pipeline.zadd(key, mapping=items) + self._redis_pipeline.execute() def count(self): return sum([self._redis.zcard(partition_id) for partition_id in self._partitions]) From e41880779e073b0471828cb61a6cfa130e51433a Mon Sep 17 00:00:00 2001 From: sibiryakov Date: Wed, 3 Apr 2019 17:17:44 +0200 Subject: [PATCH 253/273] fixed DomainCache crash on init --- frontera/contrib/backends/hbase/__init__.py | 2 +- frontera/contrib/backends/hbase/domaincache.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/frontera/contrib/backends/hbase/__init__.py b/frontera/contrib/backends/hbase/__init__.py index 13df3743a..11db9d889 100644 --- a/frontera/contrib/backends/hbase/__init__.py +++ b/frontera/contrib/backends/hbase/__init__.py @@ -561,7 +561,7 @@ def domain_metadata(self): def frontier_start(self): for component in [self.metadata, self.queue, self.states, self.domain_metadata]: - if component: + if component is not None: component.frontier_start() def frontier_stop(self): diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py index e88208ac5..6084ef1a5 100644 --- a/frontera/contrib/backends/hbase/domaincache.py +++ b/frontera/contrib/backends/hbase/domaincache.py @@ -180,6 +180,9 @@ def __missing__(self, key): __len__ = None + def 
__bool__(self): + return True + clear = None maxsize = None From 45576af8b66eed5c2e0afa28b6bfab9639e4ad22 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 5 Apr 2019 18:17:08 +0200 Subject: [PATCH 254/273] fixing Cache.__init__ signature changes --- frontera/contrib/backends/hbase/domaincache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py index 6084ef1a5..d9f5681b1 100644 --- a/frontera/contrib/backends/hbase/domaincache.py +++ b/frontera/contrib/backends/hbase/domaincache.py @@ -23,7 +23,7 @@ class LRUCache(Cache): """Least Recently Used (LRU) cache implementation.""" def __init__(self, maxsize, missing=None, getsizeof=None): - Cache.__init__(self, maxsize, missing, getsizeof) + Cache.__init__(self, maxsize, missing=missing, getsizeof=getsizeof) self.__order = collections.OrderedDict() def __getitem__(self, key, cache_getitem=Cache.__getitem__): From 22d3bc52766ba3de5dd9abc09d9f27f93ddcf201 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 5 Apr 2019 18:25:57 +0200 Subject: [PATCH 255/273] more work --- frontera/contrib/backends/hbase/domaincache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py index d9f5681b1..8729e4ff7 100644 --- a/frontera/contrib/backends/hbase/domaincache.py +++ b/frontera/contrib/backends/hbase/domaincache.py @@ -22,8 +22,8 @@ class LRUCache(Cache): """Least Recently Used (LRU) cache implementation.""" - def __init__(self, maxsize, missing=None, getsizeof=None): - Cache.__init__(self, maxsize, missing=missing, getsizeof=getsizeof) + def __init__(self, maxsize, getsizeof=None): + Cache.__init__(self, maxsize, getsizeof=getsizeof) self.__order = collections.OrderedDict() def __getitem__(self, key, cache_getitem=Cache.__getitem__): From 9bb47c9e8b99e0930c9c7c6d6993e011dc7b966a Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 12 Apr 2019 16:58:39 +0200 Subject: [PATCH 256/273] explicit cluster metadata bootstrap in Kafka client --- frontera/contrib/messagebus/kafkabus.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py index f4761e052..c4ddf8401 100644 --- a/frontera/contrib/messagebus/kafkabus.py +++ b/frontera/contrib/messagebus/kafkabus.py @@ -53,6 +53,9 @@ def __init__(self, location, enable_ssl, cert_path, topic, group, partition_id): **kwargs ) + # explicitly causing consumer to bootstrap the cluster metadata + self._consumer.topics() + if partition_id is not None: self._partitions = [TopicPartition(self._topic, partition_id)] self._consumer.assign(self._partitions) From 4e6f547a7025ae8f68644a0501896038d51f42de Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 12 Apr 2019 17:21:05 +0200 Subject: [PATCH 257/273] syntax fixes --- frontera/contrib/backends/remote/messagebus.py | 2 +- frontera/strategy/__init__.py | 3 +-- frontera/strategy/discovery/__init__.py | 10 +++++----- tox.ini | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/frontera/contrib/backends/remote/messagebus.py b/frontera/contrib/backends/remote/messagebus.py index 4d5b564b1..b9cdf63c9 100644 --- a/frontera/contrib/backends/remote/messagebus.py +++ b/frontera/contrib/backends/remote/messagebus.py @@ -45,7 +45,7 @@ def frontier_stop(self): self.consumer.close() def add_seeds(self, seeds): - raise 
NotImplemented("The seeds addition using spider log isn't allowed") + raise NotImplementedError("The seeds addition using spider log isn't allowed") def page_crawled(self, response): host_fprint = get_host_fprint(response) diff --git a/frontera/strategy/__init__.py b/frontera/strategy/__init__.py index 2c9c793a2..784f245a0 100644 --- a/frontera/strategy/__init__.py +++ b/frontera/strategy/__init__.py @@ -138,9 +138,8 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No def refresh_states(self, requests): """ - Retrieves states for all requests from storage. ` + Retrieves states for all requests from storage. if requests is not a list of :class:`Request ` objects. - :param requests: list(:class:`Request `) or a single :class:`Request ` """ self._states_context.refresh_and_keep(requests) diff --git a/frontera/strategy/discovery/__init__.py b/frontera/strategy/discovery/__init__.py index 0a615938a..7ee8bb4ac 100644 --- a/frontera/strategy/discovery/__init__.py +++ b/frontera/strategy/discovery/__init__.py @@ -41,8 +41,8 @@ def is_accessible_domain(domain): def is_domain_to_ignore(domain, max_pages): - return (not is_accessible_domain(domain) or 'banned' in domain or - domain.setdefault('queued_pages', 0) >= max_pages) + return (not is_accessible_domain(domain) or 'banned' in domain + or domain.setdefault('queued_pages', 0) >= max_pages) def justify_request_score_by_hostname(hostname, score): @@ -483,9 +483,9 @@ def _get_domain_after_redirects(self, request): origin_netloc = urlsplit(origin_url).netloc origin_2nd_name, origin_domain = self._get_domain(origin_netloc) - if redirect_urls and (b'robots' in request.meta or - b'sitemap' in request.meta or - b'home' in request.meta): + if redirect_urls and (b'robots' in request.meta + or b'sitemap' in request.meta + or b'home' in request.meta): final_netloc = urlsplit(redirect_urls[-1]).netloc if final_netloc != origin_netloc: origin_redirects = origin_domain.setdefault('redirect_to', set()) diff --git a/tox.ini b/tox.ini index 705b27b61..97b4627df 100644 --- a/tox.ini +++ b/tox.ini @@ -26,7 +26,7 @@ commands = flake8 setup.py frontera # Options for flake8 [flake8] -ignore = E265,E501,F401,W391,W292,E226 +ignore = E265,E501,F401,W391,W292,E226,W504,W605 exclude = frontera/_version.py,versioneer.py,docs/source/conf.py,frontera/contrib/backends/opic/discovery.py From 1e951c6d7c8dc911c87241d1eb1be5a22cb8ee2a Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 12 Apr 2019 17:30:10 +0200 Subject: [PATCH 258/273] line breaks --- frontera/strategy/discovery/__init__.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/frontera/strategy/discovery/__init__.py b/frontera/strategy/discovery/__init__.py index 7ee8bb4ac..9f0e77f07 100644 --- a/frontera/strategy/discovery/__init__.py +++ b/frontera/strategy/discovery/__init__.py @@ -41,8 +41,7 @@ def is_accessible_domain(domain): def is_domain_to_ignore(domain, max_pages): - return (not is_accessible_domain(domain) or 'banned' in domain - or domain.setdefault('queued_pages', 0) >= max_pages) + return (not is_accessible_domain(domain) or 'banned' in domain or domain.setdefault('queued_pages', 0) >= max_pages) def justify_request_score_by_hostname(hostname, score): @@ -483,9 +482,7 @@ def _get_domain_after_redirects(self, request): origin_netloc = urlsplit(origin_url).netloc origin_2nd_name, origin_domain = self._get_domain(origin_netloc) - if redirect_urls and (b'robots' in request.meta - or b'sitemap' in request.meta - or b'home' in request.meta): + 
if redirect_urls and (b'robots' in request.meta or b'sitemap' in request.meta or b'home' in request.meta): final_netloc = urlsplit(redirect_urls[-1]).netloc if final_netloc != origin_netloc: origin_redirects = origin_domain.setdefault('redirect_to', set()) From 5762a2658ee1d8c57a8edb3aa0cf4ac8df1b7a4e Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Fri, 12 Apr 2019 17:31:03 +0200 Subject: [PATCH 259/273] notimplemented->error --- frontera/worker/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/worker/stats.py b/frontera/worker/stats.py index e3d7c7a76..f5dee4aed 100644 --- a/frontera/worker/stats.py +++ b/frontera/worker/stats.py @@ -51,7 +51,7 @@ def get_stats_tags(self, *args, **kwargs): - 'source' - source type of the metric, one of ['sw', 'dbw', 'spider'] - 'partition_id' (optionally) - specific partition id """ - raise NotImplemented("Please define the method in a child class") + raise NotImplementedError("Please define the method in a child class") @property def _stats_key_prefix(self): From befc86642ab98dd0e78efa62d30a208677d835a1 Mon Sep 17 00:00:00 2001 From: liho00 <34411102+liho00@users.noreply.github.com> Date: Wed, 3 Jul 2019 19:55:05 +0800 Subject: [PATCH 260/273] update models.py Solving the problem of 'Decode spider logs self.body return none error' https://github.com/scrapinghub/frontera/issues/374 --- frontera/core/models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/frontera/core/models.py b/frontera/core/models.py index 0b1c37423..ba59a0b4b 100644 --- a/frontera/core/models.py +++ b/frontera/core/models.py @@ -80,6 +80,7 @@ def body(self): return self._body def __str__(self): + self._body = None if self.body is None else self.body[:20] return "<%s at 0x%0x %s meta=%s body=%s... cookies=%s, headers=%s>" % (type(self).__name__, id(self), self.url, str(self.meta), str(self.body[:20]), str(self.cookies), str(self.headers)) @@ -160,6 +161,7 @@ def meta(self): "is not tied to any request") def __str__(self): + self._body = None if self.body is None else self.body[:20] return "<%s at 0x%0x %s %s meta=%s body=%s... headers=%s>" % (type(self).__name__, id(self), self.status_code, self.url, str(self.meta), From 2457357bb33f9670a8a337fa7818a76fb5b5c66e Mon Sep 17 00:00:00 2001 From: liho00 <34411102+liho00@users.noreply.github.com> Date: Wed, 3 Jul 2019 21:57:25 +0800 Subject: [PATCH 261/273] Update models.py --- frontera/core/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontera/core/models.py b/frontera/core/models.py index ba59a0b4b..ef0e3709c 100644 --- a/frontera/core/models.py +++ b/frontera/core/models.py @@ -82,7 +82,7 @@ def body(self): def __str__(self): self._body = None if self.body is None else self.body[:20] return "<%s at 0x%0x %s meta=%s body=%s... cookies=%s, headers=%s>" % (type(self).__name__, id(self), self.url, - str(self.meta), str(self.body[:20]), + str(self.meta), str(self.body), str(self.cookies), str(self.headers)) def __hash__(self): @@ -165,6 +165,6 @@ def __str__(self): return "<%s at 0x%0x %s %s meta=%s body=%s... 
headers=%s>" % (type(self).__name__, id(self), self.status_code, self.url, str(self.meta), - str(self.body[:20]), str(self.headers)) + str(self.body), str(self.headers)) __repr__ = __str__ From 13efd27cf8bbd7e2d1fdf24b05ba4524df11fe57 Mon Sep 17 00:00:00 2001 From: liho00 <34411102+liho00@users.noreply.github.com> Date: Wed, 3 Jul 2019 23:38:48 +0800 Subject: [PATCH 262/273] Update models.py --- frontera/core/models.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/frontera/core/models.py b/frontera/core/models.py index ef0e3709c..3266292f8 100644 --- a/frontera/core/models.py +++ b/frontera/core/models.py @@ -80,9 +80,8 @@ def body(self): return self._body def __str__(self): - self._body = None if self.body is None else self.body[:20] return "<%s at 0x%0x %s meta=%s body=%s... cookies=%s, headers=%s>" % (type(self).__name__, id(self), self.url, - str(self.meta), str(self.body), + str(self.meta), str(self.body[:20]) if self.body is not None else None, str(self.cookies), str(self.headers)) def __hash__(self): @@ -161,10 +160,9 @@ def meta(self): "is not tied to any request") def __str__(self): - self._body = None if self.body is None else self.body[:20] return "<%s at 0x%0x %s %s meta=%s body=%s... headers=%s>" % (type(self).__name__, id(self), self.status_code, self.url, str(self.meta), - str(self.body), str(self.headers)) + str(self.body[:20]) if self.body is not None else None, str(self.headers)) __repr__ = __str__ From 59050bf5d2204d586f2c7381711fefcfa6b60d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susannah=20Klane=C4=8Dek?= Date: Wed, 23 Oct 2019 18:37:27 +0200 Subject: [PATCH 263/273] Bump to Python 3.6 --- .travis.yml | 7 +++++++ README.md | 2 -- setup.py | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5dfdb5587..0bbf5ef93 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,6 +28,13 @@ matrix: - mysql - postgresql - redis-server + - python: 3.6 + env: TOXENV=py36 + services: + - docker + - mysql + - postgresql + - redis-server install: - pip install -U tox wheel codecov diff --git a/README.md b/README.md index c8df8bb56..3ad356d7e 100644 --- a/README.md +++ b/README.md @@ -38,5 +38,3 @@ $ pip install frontera Join our Google group at https://groups.google.com/a/scrapinghub.com/forum/#!forum/frontera or check GitHub issues and pull requests. 
- - diff --git a/setup.py b/setup.py index df0e30a3c..e00c6b88b 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Topic :: Internet :: WWW/HTTP', 'Topic :: Software Development :: Libraries :: Application Frameworks', 'Topic :: Software Development :: Libraries :: Python Modules', From f05104e641ede8b177ef48e2abd0f017fc8c7e24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susannah=20Klane=C4=8Dek?= Date: Wed, 23 Oct 2019 18:52:10 +0200 Subject: [PATCH 264/273] Use thriftpy2 --- frontera/contrib/backends/hbase/utils.py | 4 ++-- setup.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/frontera/contrib/backends/hbase/utils.py b/frontera/contrib/backends/hbase/utils.py index 0f58a55cb..6dc862e92 100644 --- a/frontera/contrib/backends/hbase/utils.py +++ b/frontera/contrib/backends/hbase/utils.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from happybase import Batch -from thriftpy.transport import TTransportException +from thriftpy2.transport import TTransportException import logging @@ -19,4 +19,4 @@ def send(self): self.logger.exception("Exception happened during batch persistence") self.logger.warning("Cleaning up the batch") self._reset_mutations() - pass \ No newline at end of file + pass diff --git a/setup.py b/setup.py index e00c6b88b..d029cde90 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,8 @@ install_requires=[ 'six>=1.8.0', 'w3lib>=1.15.0', - 'cityhash>=0.1.7' + 'cityhash>=0.1.7', + 'thriftpy2' ], extras_require={ 'sql': [ From e4885cbf4154eea79949f20fb90605d04b0d6097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susannah=20Klane=C4=8Dek?= Date: Wed, 23 Oct 2019 19:06:46 +0200 Subject: [PATCH 265/273] Add Python 3.7 --- .travis.yml | 7 +++++++ setup.py | 1 + 2 files changed, 8 insertions(+) diff --git a/.travis.yml b/.travis.yml index 0bbf5ef93..837e16675 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,6 +35,13 @@ matrix: - mysql - postgresql - redis-server + - python: 3.7 + env: TOXENV=py37 + services: + - docker + - mysql + - postgresql + - redis-server install: - pip install -U tox wheel codecov diff --git a/setup.py b/setup.py index d029cde90..e5e87805c 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Topic :: Internet :: WWW/HTTP', 'Topic :: Software Development :: Libraries :: Application Frameworks', 'Topic :: Software Development :: Libraries :: Python Modules', From 4a4d8b8c814273d76312e580fa72254a54a6b544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susannah=20Klane=C4=8Dek?= Date: Wed, 23 Oct 2019 19:13:11 +0200 Subject: [PATCH 266/273] Make test_zmq_message_bus() flaky --- requirements/tests.txt | 1 + tests/test_message_bus.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/requirements/tests.txt b/requirements/tests.txt index 0c63dc53c..0fa21a808 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -1,3 +1,4 @@ +flaky pytest>=2.6.4 PyMySQL>=0.6.3 psycopg2>=2.5.4 diff --git a/tests/test_message_bus.py b/tests/test_message_bus.py index 5293c2445..1addc6ca3 100644 --- a/tests/test_message_bus.py +++ b/tests/test_message_bus.py @@ -4,6 +4,7 @@ from frontera.contrib.messagebus.zeromq import MessageBus as ZeroMQMessageBus from frontera.contrib.messagebus.kafkabus import MessageBus as 
KafkaMessageBus from frontera.utils.fingerprint import sha1 +from flaky import flaky from kafka import KafkaClient from random import randint from time import sleep @@ -234,6 +235,7 @@ def __init__(self): super(IPv6MessageBusTester, self).__init__(settings) +@flaky def test_zmq_message_bus(): """ Test MessageBus with default settings, IPv6 and Star as ZMQ_ADDRESS From 4e49f76d209c274f3fbd21d4030c323da478f4e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susannah=20Klane=C4=8Dek?= Date: Wed, 23 Oct 2019 19:35:00 +0200 Subject: [PATCH 267/273] Add .env to gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 391d32d6e..4b9aaea27 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,6 @@ docs/_build/ # PyBuilder target/ + +# Virtualenv +.env/ From 5c5972bf7f3e742b8379980baa60df3d8462f7a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susannah=20Klane=C4=8Dek?= Date: Thu, 24 Oct 2019 16:30:39 +0200 Subject: [PATCH 268/273] Add badges --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index c8df8bb56..41f34054e 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # Frontera +[![pypi](https://img.shields.io/pypi/v/frontera)](https://pypi.org/project/frontera/) +[![Build Status](https://travis-ci.org/scrapinghub/frontera.svg?branch=master)](https://travis-ci.org/scrapinghub/frontera) +[![codecov](https://codecov.io/gh/scrapinghub/frontera/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapinghub/frontera) + ## Overview Frontera is a web crawling framework consisting of [crawl frontier](http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html), and distribution/scaling primitives, allowing to build a large scale online web crawler. From 482b4d5b53ce9698a376b25632e2ed06f9cb71cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susannah=20Klane=C4=8Dek?= Date: Thu, 24 Oct 2019 16:40:19 +0200 Subject: [PATCH 269/273] Add pyversions badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 41f34054e..790823ecb 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # Frontera [![pypi](https://img.shields.io/pypi/v/frontera)](https://pypi.org/project/frontera/) +[![pypi](https://img.shields.io/pypi/pyversions/frontera.svg)](https://pypi.org/project/frontera/) [![Build Status](https://travis-ci.org/scrapinghub/frontera.svg?branch=master)](https://travis-ci.org/scrapinghub/frontera) [![codecov](https://codecov.io/gh/scrapinghub/frontera/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapinghub/frontera) From e6bb8daa3308462ae6c2e86142ce9c1c239a4bcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susannah=20Klane=C4=8Dek?= Date: Thu, 24 Oct 2019 16:47:30 +0200 Subject: [PATCH 270/273] Rename to "python versions" --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 790823ecb..478a1aff8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Frontera [![pypi](https://img.shields.io/pypi/v/frontera)](https://pypi.org/project/frontera/) -[![pypi](https://img.shields.io/pypi/pyversions/frontera.svg)](https://pypi.org/project/frontera/) +[![python versions](https://img.shields.io/pypi/pyversions/frontera.svg)](https://pypi.org/project/frontera/) [![Build Status](https://travis-ci.org/scrapinghub/frontera.svg?branch=master)](https://travis-ci.org/scrapinghub/frontera) [![codecov](https://codecov.io/gh/scrapinghub/frontera/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapinghub/frontera) From 
c5a70013911b71c3451df573b2208b4aec6aac95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susannah=20Klane=C4=8Dek?= Date: Fri, 25 Oct 2019 10:58:41 +0200 Subject: [PATCH 271/273] Move thriftpy2 to test_requires --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e5e87805c..e870cb505 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,7 @@ install_requires=[ 'six>=1.8.0', 'w3lib>=1.15.0', - 'cityhash>=0.1.7', - 'thriftpy2' + 'cityhash>=0.1.7' ], extras_require={ 'sql': [ @@ -94,6 +93,7 @@ "psycopg2>=2.5.4", "scrapy>=0.24", "tldextract>=1.5.1", + 'thriftpy2', "SQLAlchemy>=1.0.0", "cachetools", "mock", From ebb4d322ff3bf74adba43825c0575cbc539b39c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susannah=20Klane=C4=8Dek?= Date: Fri, 25 Oct 2019 12:06:01 +0200 Subject: [PATCH 272/273] Also include thriftpy2 as an hbase requirement --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e870cb505..11bff20f3 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,8 @@ 'tldextract>=1.5.1', ], 'hbase': [ - 'happybase>=1.0.0' + 'happybase>=1.0.0', + 'thriftpy2' ], 'zeromq': [ 'pyzmq', From d5f5905e9f35759d1859a2bc877fba8e9cfbe89e Mon Sep 17 00:00:00 2001 From: Alexey Shkarupin Date: Fri, 29 Nov 2019 11:10:00 +0200 Subject: [PATCH 273/273] add assertion error for easier debugging --- frontera/contrib/messagebus/kafkabus.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py index c4ddf8401..10e5d5b53 100644 --- a/frontera/contrib/messagebus/kafkabus.py +++ b/frontera/contrib/messagebus/kafkabus.py @@ -190,7 +190,8 @@ def __init__(self, messagebus): def consumer(self, partition_id): c = Consumer(self._location, self._enable_ssl, self._cert_path, self._topic, self._general_group, partition_id) - assert len(c._consumer.partitions_for_topic(self._topic)) == self._partitions + assert len(c._consumer.partitions_for_topic(self._topic)) == self._partitions, \ + "Number of kafka topic partitions doesn't match value in config for spider feed" return c def available_partitions(self):