diff --git a/.github/workflows/python-formatting.yml b/.github/workflows/python-formatting.yml
new file mode 100644
index 00000000..a0a861d8
--- /dev/null
+++ b/.github/workflows/python-formatting.yml
@@ -0,0 +1,28 @@
+name: Python Formatting Check
+
+on:
+  push:
+    branches:
+      - main
+      - master
+  pull_request:
+    branches:
+      - main
+      - master
+
+jobs:
+  formatting:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.8'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install black
+      - name: Run formatting check
+        run: |
+          make ck-format
diff --git a/.gitignore b/.gitignore
index 14d49f48..174149f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,5 @@
 *.diff
 .*.sw*
 /brozzler.egg-info/
+venv
+.idea
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..f99dcc97
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,7 @@
+.PHONY: format
+format:
+	venv/bin/black -t py35 -t py36 -t py37 -t py38 -t py39 -t py310 -t py311 -t py312 .
+
+.PHONY: ck-format
+ck-format:
+	venv/bin/black --check .
diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index 3be20f2c..5040e698 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -19,33 +19,41 @@
 import logging
 from pkg_resources import get_distribution as _get_distribution
 
-__version__ = _get_distribution('brozzler').version
+
+__version__ = _get_distribution("brozzler").version
+
 
 class ShutdownRequested(Exception):
     pass
 
+
 class NothingToClaim(Exception):
     pass
 
+
 class CrawlStopped(Exception):
     pass
 
+
 class PageInterstitialShown(Exception):
     pass
 
+
 class ProxyError(Exception):
     pass
 
+
 class ReachedTimeLimit(Exception):
     pass
 
+
 class ReachedLimit(Exception):
     def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
         import json
+
         if http_error:
             if "warcprox-meta" in http_error.headers:
-                self.warcprox_meta = json.loads(
-                        http_error.headers["warcprox-meta"])
+                self.warcprox_meta = json.loads(http_error.headers["warcprox-meta"])
             else:
                 self.warcprox_meta = None
             self.http_payload = http_error.read()
@@ -55,28 +63,39 @@ def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
 
     def __repr__(self):
         return "ReachedLimit(warcprox_meta=%r,http_payload=%r)" % (
-                self.warcprox_meta if hasattr(self, 'warcprox_meta') else None,
-                self.http_payload if hasattr(self, 'http_payload') else None)
+            self.warcprox_meta if hasattr(self, "warcprox_meta") else None,
+            self.http_payload if hasattr(self, "http_payload") else None,
+        )
 
     def __str__(self):
         return self.__repr__()
 
+
 # monkey-patch log levels TRACE and NOTICE
 logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
+
+
 def _logger_trace(self, msg, *args, **kwargs):
     if self.isEnabledFor(logging.TRACE):
         self._log(logging.TRACE, msg, args, **kwargs)
+
+
 logging.Logger.trace = _logger_trace
 logging.trace = logging.root.trace
-logging.addLevelName(logging.TRACE, 'TRACE')
+logging.addLevelName(logging.TRACE, "TRACE")
 
 logging.NOTICE = (logging.INFO + logging.WARN) // 2
+
+
 def _logger_notice(self, msg, *args, **kwargs):
     if self.isEnabledFor(logging.NOTICE):
         self._log(logging.NOTICE, msg, args, **kwargs)
+
+
 logging.Logger.notice = _logger_notice
 logging.notice = logging.root.notice
-logging.addLevelName(logging.NOTICE, 'NOTICE')
+logging.addLevelName(logging.NOTICE, "NOTICE")
+
 
 # see https://github.com/internetarchive/brozzler/issues/91
 def _logging_handler_handle(self, record):
@@ -91,9 +110,13 @@ def _logging_handler_handle(self,
record): except: pass return rv + + logging.Handler.handle = _logging_handler_handle _behaviors = None + + def behaviors(behaviors_dir=None): """Return list of JS behaviors loaded from YAML file. @@ -101,35 +124,43 @@ def behaviors(behaviors_dir=None): `js-templates/`. Defaults to brozzler dir. """ import os, yaml, string + global _behaviors if _behaviors is None: d = behaviors_dir or os.path.dirname(__file__) - behaviors_yaml = os.path.join(d, 'behaviors.yaml') + behaviors_yaml = os.path.join(d, "behaviors.yaml") with open(behaviors_yaml) as fin: _behaviors = yaml.safe_load(fin) return _behaviors + def behavior_script(url, template_parameters=None, behaviors_dir=None): - ''' + """ Returns the javascript behavior string populated with template_parameters. - ''' + """ import re, logging, json + for behavior in behaviors(behaviors_dir=behaviors_dir): - if re.match(behavior['url_regex'], url): + if re.match(behavior["url_regex"], url): parameters = dict() - if 'default_parameters' in behavior: - parameters.update(behavior['default_parameters']) + if "default_parameters" in behavior: + parameters.update(behavior["default_parameters"]) if template_parameters: parameters.update(template_parameters) template = jinja2_environment(behaviors_dir).get_template( - behavior['behavior_js_template']) + behavior["behavior_js_template"] + ) script = template.render(parameters) logging.info( - 'using template=%r populated with parameters=%r for %r', - behavior['behavior_js_template'], json.dumps(parameters), url) + "using template=%r populated with parameters=%r for %r", + behavior["behavior_js_template"], + json.dumps(parameters), + url, + ) return script return None + class ThreadExceptionGate: logger = logging.getLogger(__module__ + "." + __qualname__) @@ -142,8 +173,7 @@ def __init__(self, thread): def __enter__(self): assert self.thread == threading.current_thread() if self.pending_exception: - self.logger.info( - 'raising pending exception %s', self.pending_exception) + self.logger.info("raising pending exception %s", self.pending_exception) tmp = self.pending_exception self.pending_exception = None raise tmp @@ -154,25 +184,32 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): assert self.thread == threading.current_thread() self.ok_to_raise.clear() - return False # don't swallow exception + return False # don't swallow exception def queue_exception(self, e): with self.lock: if self.pending_exception: self.logger.warning( - '%r already pending for thread %r, discarding %r', - self.pending_exception, self.thread, e) + "%r already pending for thread %r, discarding %r", + self.pending_exception, + self.thread, + e, + ) else: self.pending_exception = e def __repr__(self): - return '' % self.thread + return "" % self.thread + import threading + _thread_exception_gates = {} _thread_exception_gates_lock = threading.Lock() + + def thread_exception_gate(thread=None): - ''' + """ Returns a `ThreadExceptionGate` for `thread` (current thread by default). `ThreadExceptionGate` is a context manager which allows exceptions to be @@ -191,7 +228,7 @@ def thread_exception_gate(thread=None): is queued, and raised immediately if and when the thread enters the context. Only one exception will be queued this way at a time, others are discarded. 
- ''' + """ if not thread: thread = threading.current_thread() @@ -201,10 +238,12 @@ def thread_exception_gate(thread=None): return _thread_exception_gates[thread] + thread_accept_exceptions = thread_exception_gate + def thread_raise(thread, exctype): - ''' + """ Raises or queues the exception `exctype` for the thread `thread`. See the documentation on the function `thread_exception_gate()` for more @@ -218,40 +257,43 @@ def thread_raise(thread, exctype): Raises: TypeError if `exctype` is not a class ValueError, SystemError in case of unexpected problems - ''' + """ import ctypes, inspect, threading, logging if not inspect.isclass(exctype): raise TypeError( - 'cannot raise %s, only exception types can be raised (not ' - 'instances)' % exctype) + "cannot raise %s, only exception types can be raised (not " + "instances)" % exctype + ) gate = thread_exception_gate(thread) with gate.lock: if gate.ok_to_raise.is_set() and thread.is_alive(): gate.ok_to_raise.clear() - logging.info('raising %s in thread %s', exctype, thread) + logging.info("raising %s in thread %s", exctype, thread) res = ctypes.pythonapi.PyThreadState_SetAsyncExc( - ctypes.c_long(thread.ident), ctypes.py_object(exctype)) + ctypes.c_long(thread.ident), ctypes.py_object(exctype) + ) if res == 0: - raise ValueError( - 'invalid thread id? thread.ident=%s' % thread.ident) + raise ValueError("invalid thread id? thread.ident=%s" % thread.ident) elif res != 1: # if it returns a number greater than one, you're in trouble, # and you should call it again with exc=NULL to revert the effect ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0) - raise SystemError('PyThreadState_SetAsyncExc failed') + raise SystemError("PyThreadState_SetAsyncExc failed") else: - logging.info('queueing %s for thread %s', exctype, thread) + logging.info("queueing %s for thread %s", exctype, thread) gate.queue_exception(exctype) + def sleep(duration): - ''' + """ Sleeps for duration seconds in increments of 0.5 seconds. Use this so that the sleep can be interrupted by thread_raise(). - ''' + """ import time + start = time.time() while True: elapsed = time.time() - start @@ -259,32 +301,41 @@ def sleep(duration): break time.sleep(min(duration - elapsed, 0.5)) + _jinja2_env = None + + def jinja2_environment(behaviors_dir=None): global _jinja2_env if not _jinja2_env: import os, jinja2, json + if behaviors_dir: - _loader = jinja2.FileSystemLoader(os.path.join(behaviors_dir, - 'js-templates')) + _loader = jinja2.FileSystemLoader( + os.path.join(behaviors_dir, "js-templates") + ) else: - _loader=jinja2.PackageLoader('brozzler', 'js-templates') + _loader = jinja2.PackageLoader("brozzler", "js-templates") _jinja2_env = jinja2.Environment(loader=_loader, auto_reload=False) - _jinja2_env.filters['json'] = json.dumps + _jinja2_env.filters["json"] = json.dumps return _jinja2_env + import urlcanon + + def _remove_query(url): - url.question_mark = b'' - url.query = b'' + url.question_mark = b"" + url.query = b"" + + # XXX chop off path after last slash?? 
-site_surt_canon = urlcanon.Canonicalizer( - urlcanon.semantic.steps + [_remove_query]) +site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query]) import doublethink import datetime -EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace( - tzinfo=doublethink.UTC) + +EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC) # we could make this configurable if there's a good reason MAX_PAGE_FAILURES = 3 @@ -294,10 +345,31 @@ def _remove_query(url): from brozzler.frontier import RethinkDbFrontier from brozzler.browser import Browser, BrowserPool, BrowsingException from brozzler.model import ( - new_job, new_job_file, new_site, Job, Page, Site, InvalidJobConf) + new_job, + new_job_file, + new_site, + Job, + Page, + Site, + InvalidJobConf, +) from brozzler.cli import suggest_default_chrome_exe -__all__ = ['Page', 'Site', 'BrozzlerWorker', 'is_permitted_by_robots', - 'RethinkDbFrontier', 'Browser', 'BrowserPool', 'BrowsingException', - 'new_job', 'new_site', 'Job', 'new_job_file', 'InvalidJobConf', - 'sleep', 'thread_accept_exceptions', 'thread_raise'] +__all__ = [ + "Page", + "Site", + "BrozzlerWorker", + "is_permitted_by_robots", + "RethinkDbFrontier", + "Browser", + "BrowserPool", + "BrowsingException", + "new_job", + "new_site", + "Job", + "new_job_file", + "InvalidJobConf", + "sleep", + "thread_accept_exceptions", + "thread_raise", +] diff --git a/brozzler/browser.py b/brozzler/browser.py index b9854aad..7ae5828d 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -1,4 +1,4 @@ -''' +""" brozzler/browser.py - manages the browsers for brozzler Copyright (C) 2014-2023 Internet Archive @@ -14,7 +14,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -''' +""" import logging import time @@ -33,30 +33,35 @@ import socket import urlcanon + class BrowsingException(Exception): pass + class NoBrowsersAvailable(Exception): pass + class BrowsingTimeout(BrowsingException): pass + class BrowserPool: - ''' + """ Manages pool of browsers. Automatically chooses available port for the debugging protocol. - ''' - logger = logging.getLogger(__module__ + '.' + __qualname__) + """ + + logger = logging.getLogger(__module__ + "." + __qualname__) def __init__(self, size=3, **kwargs): - ''' + """ Initializes the pool. Args: size: size of pool (default 3) **kwargs: arguments for Browser(...) - ''' + """ self.size = size self.kwargs = kwargs self._in_use = set() @@ -65,7 +70,7 @@ def __init__(self, size=3, **kwargs): def _fresh_browser(self): # choose available port sock = socket.socket() - sock.bind(('0.0.0.0', 0)) + sock.bind(("0.0.0.0", 0)) port = sock.getsockname()[1] sock.close() @@ -73,12 +78,12 @@ def _fresh_browser(self): return browser def acquire_multi(self, n=1): - ''' + """ Returns a list of up to `n` browsers. Raises: NoBrowsersAvailable if none available - ''' + """ browsers = [] with self._lock: if len(self._in_use) >= self.size: @@ -90,7 +95,7 @@ def acquire_multi(self, n=1): return browsers def acquire(self): - ''' + """ Returns an available instance. 
Returns: @@ -98,7 +103,7 @@ def acquire(self): Raises: NoBrowsersAvailable if none available - ''' + """ with self._lock: if len(self._in_use) >= self.size: raise NoBrowsersAvailable @@ -120,8 +125,8 @@ def release_all(self, browsers): def shutdown_now(self): self.logger.info( - 'shutting down browser pool (%s browsers in use)', - len(self._in_use)) + "shutting down browser pool (%s browsers in use)", len(self._in_use) + ) with self._lock: for browser in self._in_use: browser.stop() @@ -132,8 +137,9 @@ def num_available(self): def num_in_use(self): return len(self._in_use) + class WebsockReceiverThread(threading.Thread): - logger = logging.getLogger(__module__ + '.' + __qualname__) + logger = logging.getLogger(__module__ + "." + __qualname__) def __init__(self, websock, name=None, daemon=True): super().__init__(name=name, daemon=daemon) @@ -149,7 +155,7 @@ def __init__(self, websock, name=None, daemon=True): self.is_open = False self.got_page_load_event = None - self.page_status = None # Loaded page HTTP status code + self.page_status = None # Loaded page HTTP status code self.reached_limit = None self.on_request = None @@ -175,50 +181,54 @@ def _on_open(self, websock): self.is_open = True def _on_error(self, websock, e): - ''' + """ Raises BrowsingException in the thread that created this instance. - ''' - if isinstance(e, ( - websocket.WebSocketConnectionClosedException, - ConnectionResetError)): - self.logger.error('websocket closed, did chrome die?') + """ + if isinstance( + e, (websocket.WebSocketConnectionClosedException, ConnectionResetError) + ): + self.logger.error("websocket closed, did chrome die?") else: - self.logger.error( - 'exception from websocket receiver thread', - exc_info=1) + self.logger.error("exception from websocket receiver thread", exc_info=1) brozzler.thread_raise(self.calling_thread, BrowsingException) def run(self): # ping_timeout is used as the timeout for the call to select.select() # in addition to its documented purpose, and must have a value to avoid # hangs in certain situations - self.websock.run_forever(sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), - ping_timeout=0.5) + self.websock.run_forever( + sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), ping_timeout=0.5 + ) def _on_message(self, websock, message): try: self._handle_message(websock, message) except: self.logger.error( - 'uncaught exception in _handle_message message=%s', - message, exc_info=True) + "uncaught exception in _handle_message message=%s", + message, + exc_info=True, + ) def _network_response_received(self, message): - status = message['params']['response'].get('status') - if (status == 420 and 'Warcprox-Meta' in CaseInsensitiveDict( - message['params']['response']['headers'])): + status = message["params"]["response"].get("status") + if status == 420 and "Warcprox-Meta" in CaseInsensitiveDict( + message["params"]["response"]["headers"] + ): if not self.reached_limit: - warcprox_meta = json.loads(CaseInsensitiveDict( - message['params']['response']['headers'])['Warcprox-Meta']) - self.reached_limit = brozzler.ReachedLimit( - warcprox_meta=warcprox_meta) - self.logger.info('reached limit %s', self.reached_limit) - brozzler.thread_raise( - self.calling_thread, brozzler.ReachedLimit) + warcprox_meta = json.loads( + CaseInsensitiveDict(message["params"]["response"]["headers"])[ + "Warcprox-Meta" + ] + ) + self.reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta) + self.logger.info("reached limit %s", self.reached_limit) + 
brozzler.thread_raise(self.calling_thread, brozzler.ReachedLimit) else: self.logger.info( - 'reached limit but self.reached_limit is already set, ' - 'assuming the calling thread is already handling this') + "reached limit but self.reached_limit is already set, " + "assuming the calling thread is already handling this" + ) if self.on_response: self.on_response(message) @@ -226,75 +236,92 @@ def _network_response_received(self, message): self.page_status = status def _javascript_dialog_opening(self, message): - self.logger.info('javascript dialog opened: %s', message) - if message['params']['type'] == 'alert': + self.logger.info("javascript dialog opened: %s", message) + if message["params"]["type"] == "alert": accept = True else: accept = False self.websock.send( - json.dumps(dict( - id=0, method='Page.handleJavaScriptDialog', - params={'accept': accept}), separators=',:')) + json.dumps( + dict( + id=0, + method="Page.handleJavaScriptDialog", + params={"accept": accept}, + ), + separators=",:", + ) + ) def _handle_message(self, websock, json_message): message = json.loads(json_message) - if 'method' in message: - if message['method'] == 'Page.loadEventFired': + if "method" in message: + if message["method"] == "Page.loadEventFired": self.got_page_load_event = datetime.datetime.utcnow() - elif message['method'] == 'Network.responseReceived': + elif message["method"] == "Network.responseReceived": self._network_response_received(message) - elif message['method'] == 'Network.requestWillBeSent': + elif message["method"] == "Network.requestWillBeSent": if self.on_request: self.on_request(message) - elif message['method'] == 'Page.interstitialShown': + elif message["method"] == "Page.interstitialShown": # AITFIVE-1529: handle http auth # we should kill the browser when we receive Page.interstitialShown and # consider the page finished, until this is fixed: # https://bugs.chromium.org/p/chromium/issues/detail?id=764505 - self.logger.info('Page.interstialShown (likely unsupported http auth request)') - brozzler.thread_raise(self.calling_thread, brozzler.PageInterstitialShown) - elif message['method'] == 'Inspector.targetCrashed': - self.logger.error( - '''chrome tab went "aw snap" or "he's dead jim"!''') + self.logger.info( + "Page.interstialShown (likely unsupported http auth request)" + ) + brozzler.thread_raise( + self.calling_thread, brozzler.PageInterstitialShown + ) + elif message["method"] == "Inspector.targetCrashed": + self.logger.error("""chrome tab went "aw snap" or "he's dead jim"!""") brozzler.thread_raise(self.calling_thread, BrowsingException) - elif message['method'] == 'Console.messageAdded': + elif message["method"] == "Console.messageAdded": self.logger.debug( - 'console.%s %s', message['params']['message']['level'], - message['params']['message']['text']) - elif message['method'] == 'Runtime.exceptionThrown': - self.logger.debug('uncaught exception: %s', message) - elif message['method'] == 'Page.javascriptDialogOpening': + "console.%s %s", + message["params"]["message"]["level"], + message["params"]["message"]["text"], + ) + elif message["method"] == "Runtime.exceptionThrown": + self.logger.debug("uncaught exception: %s", message) + elif message["method"] == "Page.javascriptDialogOpening": self._javascript_dialog_opening(message) - elif (message['method'] == 'Network.loadingFailed' - and 'params' in message and 'errorText' in message['params'] - and message['params']['errorText'] == 'net::ERR_PROXY_CONNECTION_FAILED'): + elif ( + message["method"] == "Network.loadingFailed" + 
and "params" in message + and "errorText" in message["params"] + and message["params"]["errorText"] == "net::ERR_PROXY_CONNECTION_FAILED" + ): brozzler.thread_raise(self.calling_thread, brozzler.ProxyError) - elif message['method'] == 'ServiceWorker.workerVersionUpdated': + elif message["method"] == "ServiceWorker.workerVersionUpdated": if self.on_service_worker_version_updated: self.on_service_worker_version_updated(message) # else: # self.logger.debug("%s %s", message["method"], json_message) - elif 'result' in message: - if message['id'] in self._result_messages: - self._result_messages[message['id']] = message - # else: - # self.logger.debug("%s", json_message) - # else: - # self.logger.debug("%s", json_message) + elif "result" in message: + if message["id"] in self._result_messages: + self._result_messages[message["id"]] = message + + # else: + # self.logger.debug("%s", json_message) + # else: + # self.logger.debug("%s", json_message) + class Browser: - ''' + """ Manages an instance of Chrome for browsing pages. - ''' - logger = logging.getLogger(__module__ + '.' + __qualname__) + """ + + logger = logging.getLogger(__module__ + "." + __qualname__) def __init__(self, **kwargs): - ''' + """ Initializes the Browser. Args: **kwargs: arguments for Chrome(...) - ''' + """ self.chrome = Chrome(**kwargs) self.websock_url = None self.websock = None @@ -311,9 +338,9 @@ def __exit__(self, *args): self.stop() def _wait_for(self, callback, timeout=None): - ''' + """ Spins until callback() returns truthy. - ''' + """ start = time.time() while True: if callback(): @@ -321,112 +348,140 @@ def _wait_for(self, callback, timeout=None): elapsed = time.time() - start if timeout and elapsed > timeout: raise BrowsingTimeout( - 'timed out after %.1fs waiting for: %s' % ( - elapsed, callback)) + "timed out after %.1fs waiting for: %s" % (elapsed, callback) + ) brozzler.sleep(self._wait_interval) def send_to_chrome(self, suppress_logging=False, **kwargs): msg_id = next(self._command_id) - kwargs['id'] = msg_id - msg = json.dumps(kwargs, separators=',:') + kwargs["id"] = msg_id + msg = json.dumps(kwargs, separators=",:") logging.log( - logging.TRACE if suppress_logging else logging.DEBUG, - 'sending message to %s: %s', self.websock, msg) + logging.TRACE if suppress_logging else logging.DEBUG, + "sending message to %s: %s", + self.websock, + msg, + ) self.websock.send(msg) return msg_id def start(self, **kwargs): - ''' + """ Starts chrome if it's not running. Args: **kwargs: arguments for self.chrome.start(...) - ''' + """ if not self.is_running(): self.websock_url = self.chrome.start(**kwargs) self.websock = websocket.WebSocketApp(self.websock_url) self.websock_thread = WebsockReceiverThread( - self.websock, name='WebsockThread:%s' % self.chrome.port) + self.websock, name="WebsockThread:%s" % self.chrome.port + ) self.websock_thread.start() self._wait_for(lambda: self.websock_thread.is_open, timeout=30) # tell browser to send us messages we're interested in - self.send_to_chrome(method='Network.enable') - self.send_to_chrome(method='Page.enable') + self.send_to_chrome(method="Network.enable") + self.send_to_chrome(method="Page.enable") # Enable Console & Runtime output only when debugging. # After all, we just print these events with debug(), we don't use # them in Brozzler logic. 
if self.logger.isEnabledFor(logging.DEBUG): - self.send_to_chrome(method='Console.enable') - self.send_to_chrome(method='Runtime.enable') - self.send_to_chrome(method='ServiceWorker.enable') - self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad') + self.send_to_chrome(method="Console.enable") + self.send_to_chrome(method="Runtime.enable") + self.send_to_chrome(method="ServiceWorker.enable") + self.send_to_chrome(method="ServiceWorker.setForceUpdateOnPageLoad") # disable google analytics and amp analytics self.send_to_chrome( - method='Network.setBlockedURLs', - params={'urls': ['*google-analytics.com/analytics.js*', - '*google-analytics.com/ga.js*', - '*google-analytics.com/ga_exp.js*', - '*google-analytics.com/urchin.js*', - '*google-analytics.com/collect*', - '*google-analytics.com/r/collect*', - '*google-analytics.com/__utm.gif*', - '*google-analytics.com/gtm/js?*', - '*google-analytics.com/cx/api.js*', - '*cdn.ampproject.org/*/amp-analytics*.js']}) + method="Network.setBlockedURLs", + params={ + "urls": [ + "*google-analytics.com/analytics.js*", + "*google-analytics.com/ga.js*", + "*google-analytics.com/ga_exp.js*", + "*google-analytics.com/urchin.js*", + "*google-analytics.com/collect*", + "*google-analytics.com/r/collect*", + "*google-analytics.com/__utm.gif*", + "*google-analytics.com/gtm/js?*", + "*google-analytics.com/cx/api.js*", + "*cdn.ampproject.org/*/amp-analytics*.js", + ] + }, + ) def stop(self): - ''' + """ Stops chrome if it's running. - ''' + """ try: - if (self.websock and self.websock.sock - and self.websock.sock.connected): - self.logger.info('shutting down websocket connection') + if self.websock and self.websock.sock and self.websock.sock.connected: + self.logger.info("shutting down websocket connection") try: self.websock.close() except BaseException as e: self.logger.error( - 'exception closing websocket %s - %s', - self.websock, e) + "exception closing websocket %s - %s", self.websock, e + ) self.chrome.stop() if self.websock_thread and ( - self.websock_thread != threading.current_thread()): + self.websock_thread != threading.current_thread() + ): self.websock_thread.join(timeout=30) if self.websock_thread.is_alive(): self.logger.error( - '%s still alive 30 seconds after closing %s, will ' - 'forcefully nudge it again', self.websock_thread, - self.websock) + "%s still alive 30 seconds after closing %s, will " + "forcefully nudge it again", + self.websock_thread, + self.websock, + ) self.websock.keep_running = False self.websock_thread.join(timeout=30) if self.websock_thread.is_alive(): self.logger.critical( - '%s still alive 60 seconds after closing %s', - self.websock_thread, self.websock) + "%s still alive 60 seconds after closing %s", + self.websock_thread, + self.websock, + ) self.websock_url = None except: - self.logger.error('problem stopping', exc_info=True) + self.logger.error("problem stopping", exc_info=True) def is_running(self): return self.websock_url is not None def browse_page( - self, page_url, extra_headers=None, - user_agent=None, behavior_parameters=None, behaviors_dir=None, - on_request=None, on_response=None, - on_service_worker_version_updated=None, on_screenshot=None, - username=None, password=None, hashtags=None, - screenshot_full_page=False, skip_extract_outlinks=False, - skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, - page_timeout=300, behavior_timeout=900, - extract_outlinks_timeout=60, download_throughput=-1, stealth=False): - ''' + self, + page_url, + extra_headers=None, + user_agent=None, + 
behavior_parameters=None, + behaviors_dir=None, + on_request=None, + on_response=None, + on_service_worker_version_updated=None, + on_screenshot=None, + username=None, + password=None, + hashtags=None, + screenshot_full_page=False, + skip_extract_outlinks=False, + skip_visit_hashtags=False, + skip_youtube_dl=False, + simpler404=False, + page_timeout=300, + behavior_timeout=900, + extract_outlinks_timeout=60, + download_throughput=-1, + stealth=False, + ): + """ Browses page in browser. Browser should already be running, i.e. start() should have been @@ -473,54 +528,60 @@ def browse_page( Raises: brozzler.ProxyError: in case of proxy connection error BrowsingException: if browsing the page fails in some other way - ''' + """ if not self.is_running(): - raise BrowsingException('browser has not been started') + raise BrowsingException("browser has not been started") if self.is_browsing: - raise BrowsingException('browser is already busy browsing a page') + raise BrowsingException("browser is already busy browsing a page") self.is_browsing = True if on_request: self.websock_thread.on_request = on_request if on_response: self.websock_thread.on_response = on_response if on_service_worker_version_updated: - self.websock_thread.on_service_worker_version_updated = \ - on_service_worker_version_updated + self.websock_thread.on_service_worker_version_updated = ( + on_service_worker_version_updated + ) try: with brozzler.thread_accept_exceptions(): self.configure_browser( - extra_headers=extra_headers, - user_agent=user_agent, - download_throughput=download_throughput, - stealth=stealth) + extra_headers=extra_headers, + user_agent=user_agent, + download_throughput=download_throughput, + stealth=stealth, + ) self.navigate_to_page(page_url, timeout=page_timeout) if password: self.try_login(username, password, timeout=page_timeout) # if login redirected us, return to page_url - if page_url != self.url().split('#')[0]: + if page_url != self.url().split("#")[0]: self.logger.debug( - 'login navigated away from %s; returning!', - page_url) + "login navigated away from %s; returning!", page_url + ) self.navigate_to_page(page_url, timeout=page_timeout) # If the target page HTTP status is 4xx/5xx, there is no point # in running behaviors, screenshot, outlink and hashtag # extraction as we didn't get a valid page. # This is only enabled with option `simpler404`. 
run_behaviors = True - if simpler404 and (self.websock_thread.page_status is None or - self.websock_thread.page_status >= 400): + if simpler404 and ( + self.websock_thread.page_status is None + or self.websock_thread.page_status >= 400 + ): run_behaviors = False if run_behaviors and behavior_timeout > 0: behavior_script = brozzler.behavior_script( - page_url, behavior_parameters, - behaviors_dir=behaviors_dir) + page_url, behavior_parameters, behaviors_dir=behaviors_dir + ) self.run_behavior(behavior_script, timeout=behavior_timeout) final_page_url = self.url() if on_screenshot: if simpler404: - if self.websock_thread.page_status and \ - self.websock_thread.page_status < 400: + if ( + self.websock_thread.page_status + and self.websock_thread.page_status < 400 + ): self._try_screenshot(on_screenshot, screenshot_full_page) else: self._try_screenshot(on_screenshot, screenshot_full_page) @@ -528,9 +589,7 @@ def browse_page( if not run_behaviors or skip_extract_outlinks: outlinks = [] else: - outlinks = self.extract_outlinks( - timeout=extract_outlinks_timeout - ) + outlinks = self.extract_outlinks(timeout=extract_outlinks_timeout) if run_behaviors and not skip_visit_hashtags: self.visit_hashtags(final_page_url, hashtags, outlinks) return final_page_url, outlinks @@ -539,7 +598,7 @@ def browse_page( # more information, raise that one raise self.websock_thread.reached_limit except websocket.WebSocketConnectionClosedException as e: - self.logger.error('websocket closed, did chrome die?') + self.logger.error("websocket closed, did chrome die?") raise BrowsingException(e) finally: self.is_browsing = False @@ -550,21 +609,24 @@ def _try_screenshot(self, on_screenshot, full_page=False): """The browser instance must be scrolled to the top of the page before trying to get a screenshot. """ - self.send_to_chrome(method='Runtime.evaluate', suppress_logging=True, - params={'expression': 'window.scroll(0,0)'}) + self.send_to_chrome( + method="Runtime.evaluate", + suppress_logging=True, + params={"expression": "window.scroll(0,0)"}, + ) for i in range(3): try: jpeg_bytes = self.screenshot(full_page) on_screenshot(jpeg_bytes) return except BrowsingTimeout as e: - logging.error('attempt %s/3: %s', i+1, e) + logging.error("attempt %s/3: %s", i + 1, e) def visit_hashtags(self, page_url, hashtags, outlinks): _hashtags = set(hashtags or []) for outlink in outlinks: url = urlcanon.whatwg(outlink) - hashtag = (url.hash_sign + url.fragment).decode('utf-8') + hashtag = (url.hash_sign + url.fragment).decode("utf-8") urlcanon.canon.remove_fragment(url) if hashtag and str(url) == page_url: _hashtags.add(hashtag) @@ -572,84 +634,85 @@ def visit_hashtags(self, page_url, hashtags, outlinks): # out which hashtags were visited already and skip those for hashtag in _hashtags: # navigate_to_hashtag (nothing to wait for so no timeout?) - self.logger.debug('navigating to hashtag %s', hashtag) + self.logger.debug("navigating to hashtag %s", hashtag) url = urlcanon.whatwg(page_url) - url.hash_sign = b'#' - url.fragment = hashtag[1:].encode('utf-8') - self.send_to_chrome( - method='Page.navigate', params={'url': str(url)}) - time.sleep(5) # um.. wait for idleness or something? + url.hash_sign = b"#" + url.fragment = hashtag[1:].encode("utf-8") + self.send_to_chrome(method="Page.navigate", params={"url": str(url)}) + time.sleep(5) # um.. wait for idleness or something? # take another screenshot? # run behavior again with short timeout? # retrieve outlinks again and append to list? 
- def configure_browser(self, extra_headers=None, user_agent=None, - download_throughput=-1, stealth=False): + def configure_browser( + self, extra_headers=None, user_agent=None, download_throughput=-1, stealth=False + ): headers = extra_headers or {} - headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch + headers["Accept-Encoding"] = "gzip" # avoid encodings br, sdch self.websock_thread.expect_result(self._command_id.peek()) msg_id = self.send_to_chrome( - method='Network.setExtraHTTPHeaders', - params={'headers': headers}) - self._wait_for( - lambda: self.websock_thread.received_result(msg_id), - timeout=10) + method="Network.setExtraHTTPHeaders", params={"headers": headers} + ) + self._wait_for(lambda: self.websock_thread.received_result(msg_id), timeout=10) if user_agent: msg_id = self.send_to_chrome( - method='Network.setUserAgentOverride', - params={'userAgent': user_agent}) + method="Network.setUserAgentOverride", params={"userAgent": user_agent} + ) if download_throughput > -1: # traffic shaping already used by SPN2 to aid warcprox resilience # parameter value as bytes/second, or -1 to disable (default) - msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions', - params={'downloadThroughput': download_throughput}) + msg_id = self.send_to_chrome( + method="Network.emulateNetworkConditions", + params={"downloadThroughput": download_throughput}, + ) if stealth: self.websock_thread.expect_result(self._command_id.peek()) - js = brozzler.jinja2_environment().get_template('stealth.js').render() + js = brozzler.jinja2_environment().get_template("stealth.js").render() msg_id = self.send_to_chrome( - method='Page.addScriptToEvaluateOnNewDocument', - params={'source': js}) + method="Page.addScriptToEvaluateOnNewDocument", params={"source": js} + ) self._wait_for( - lambda: self.websock_thread.received_result(msg_id), - timeout=10) - + lambda: self.websock_thread.received_result(msg_id), timeout=10 + ) def navigate_to_page(self, page_url, timeout=300): - self.logger.info('navigating to page %s', page_url) + self.logger.info("navigating to page %s", page_url) self.websock_thread.got_page_load_event = None self.websock_thread.page_status = None - self.send_to_chrome(method='Page.navigate', params={'url': page_url}) - self._wait_for( - lambda: self.websock_thread.got_page_load_event, - timeout=timeout) + self.send_to_chrome(method="Page.navigate", params={"url": page_url}) + self._wait_for(lambda: self.websock_thread.got_page_load_event, timeout=timeout) def extract_outlinks(self, timeout=60): - self.logger.info('extracting outlinks') + self.logger.info("extracting outlinks") self.websock_thread.expect_result(self._command_id.peek()) - js = brozzler.jinja2_environment().get_template( - 'extract-outlinks.js').render() + js = brozzler.jinja2_environment().get_template("extract-outlinks.js").render() msg_id = self.send_to_chrome( - method='Runtime.evaluate', params={'expression': js}) + method="Runtime.evaluate", params={"expression": js} + ) self._wait_for( - lambda: self.websock_thread.received_result(msg_id), - timeout=timeout) + lambda: self.websock_thread.received_result(msg_id), timeout=timeout + ) message = self.websock_thread.pop_result(msg_id) - if ('result' in message and 'result' in message['result'] - and 'value' in message['result']['result']): - if message['result']['result']['value']: + if ( + "result" in message + and "result" in message["result"] + and "value" in message["result"]["result"] + ): + if message["result"]["result"]["value"]: out = [] - for link 
in message['result']['result']['value'].split('\n'): + for link in message["result"]["result"]["value"].split("\n"): try: out.append(str(urlcanon.whatwg(link))) except AddressValueError: - self.logger.warning('skip invalid outlink: %s', link) + self.logger.warning("skip invalid outlink: %s", link) return frozenset(out) else: # no links found return frozenset() else: self.logger.error( - 'problem extracting outlinks, result message: %s', message) + "problem extracting outlinks, result message: %s", message + ) return frozenset() def screenshot(self, full_page=False, timeout=45): @@ -657,121 +720,141 @@ def screenshot(self, full_page=False, timeout=45): inspiration: https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898 """ - self.logger.info('taking screenshot') + self.logger.info("taking screenshot") if full_page: self.websock_thread.expect_result(self._command_id.peek()) - msg_id = self.send_to_chrome(method='Page.getLayoutMetrics') + msg_id = self.send_to_chrome(method="Page.getLayoutMetrics") self._wait_for( - lambda: self.websock_thread.received_result(msg_id), - timeout=timeout) + lambda: self.websock_thread.received_result(msg_id), timeout=timeout + ) message = self.websock_thread.pop_result(msg_id) - width = message['result']['contentSize']['width'] - height = message['result']['contentSize']['height'] + width = message["result"]["contentSize"]["width"] + height = message["result"]["contentSize"]["height"] clip = dict(x=0, y=0, width=width, height=height, scale=1) deviceScaleFactor = 1 - screenOrientation = {'angle': 0, 'type': 'portraitPrimary'} + screenOrientation = {"angle": 0, "type": "portraitPrimary"} self.send_to_chrome( - method='Emulation.setDeviceMetricsOverride', - params=dict(mobile=False, width=width, height=height, - deviceScaleFactor=deviceScaleFactor, - screenOrientation=screenOrientation) - ) - capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip} + method="Emulation.setDeviceMetricsOverride", + params=dict( + mobile=False, + width=width, + height=height, + deviceScaleFactor=deviceScaleFactor, + screenOrientation=screenOrientation, + ), + ) + capture_params = {"format": "jpeg", "quality": 95, "clip": clip} else: - capture_params = {'format': 'jpeg', 'quality': 95} + capture_params = {"format": "jpeg", "quality": 95} self.websock_thread.expect_result(self._command_id.peek()) - msg_id = self.send_to_chrome(method='Page.captureScreenshot', - params=capture_params) + msg_id = self.send_to_chrome( + method="Page.captureScreenshot", params=capture_params + ) self._wait_for( - lambda: self.websock_thread.received_result(msg_id), - timeout=timeout) + lambda: self.websock_thread.received_result(msg_id), timeout=timeout + ) message = self.websock_thread.pop_result(msg_id) - jpeg_bytes = base64.b64decode(message['result']['data']) + jpeg_bytes = base64.b64decode(message["result"]["data"]) return jpeg_bytes def url(self, timeout=30): - ''' + """ Returns value of document.URL from the browser. 
- ''' + """ self.websock_thread.expect_result(self._command_id.peek()) msg_id = self.send_to_chrome( - method='Runtime.evaluate', - params={'expression': 'document.URL'}) + method="Runtime.evaluate", params={"expression": "document.URL"} + ) self._wait_for( - lambda: self.websock_thread.received_result(msg_id), - timeout=timeout) + lambda: self.websock_thread.received_result(msg_id), timeout=timeout + ) message = self.websock_thread.pop_result(msg_id) - return message['result']['result']['value'] + return message["result"]["result"]["value"] def run_behavior(self, behavior_script, timeout=900): self.send_to_chrome( - method='Runtime.evaluate', suppress_logging=True, - params={'expression': behavior_script}) + method="Runtime.evaluate", + suppress_logging=True, + params={"expression": behavior_script}, + ) check_interval = min(timeout, 7) start = time.time() while True: elapsed = time.time() - start if elapsed > timeout: - logging.info( - 'behavior reached hard timeout after %.1fs', elapsed) + logging.info("behavior reached hard timeout after %.1fs", elapsed) return brozzler.sleep(check_interval) self.websock_thread.expect_result(self._command_id.peek()) msg_id = self.send_to_chrome( - method='Runtime.evaluate', suppress_logging=True, - params={'expression': 'umbraBehaviorFinished()'}) + method="Runtime.evaluate", + suppress_logging=True, + params={"expression": "umbraBehaviorFinished()"}, + ) try: self._wait_for( - lambda: self.websock_thread.received_result(msg_id), - timeout=5) + lambda: self.websock_thread.received_result(msg_id), timeout=5 + ) msg = self.websock_thread.pop_result(msg_id) - if (msg and 'result' in msg - and not ('exceptionDetails' in msg['result']) - and not ('wasThrown' in msg['result'] - and msg['result']['wasThrown']) - and 'result' in msg['result'] - and type(msg['result']['result']['value']) == bool - and msg['result']['result']['value']): - self.logger.info('behavior decided it has finished') + if ( + msg + and "result" in msg + and not ("exceptionDetails" in msg["result"]) + and not ( + "wasThrown" in msg["result"] and msg["result"]["wasThrown"] + ) + and "result" in msg["result"] + and type(msg["result"]["result"]["value"]) == bool + and msg["result"]["result"]["value"] + ): + self.logger.info("behavior decided it has finished") return except BrowsingTimeout: pass def try_login(self, username, password, timeout=300): - try_login_js = brozzler.jinja2_environment().get_template( - 'try-login.js.j2').render(username=username, password=password) + try_login_js = ( + brozzler.jinja2_environment() + .get_template("try-login.js.j2") + .render(username=username, password=password) + ) self.websock_thread.got_page_load_event = None self.send_to_chrome( - method='Runtime.evaluate', suppress_logging=True, - params={'expression': try_login_js}) + method="Runtime.evaluate", + suppress_logging=True, + params={"expression": try_login_js}, + ) # wait for tryLogin to finish trying (should be very very quick) start = time.time() while True: self.websock_thread.expect_result(self._command_id.peek()) msg_id = self.send_to_chrome( - method='Runtime.evaluate', - params={'expression': 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'}) + method="Runtime.evaluate", + params={ + "expression": 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }' + }, + ) try: self._wait_for( - lambda: self.websock_thread.received_result(msg_id), - timeout=5) + lambda: self.websock_thread.received_result(msg_id), timeout=5 + ) msg = self.websock_thread.pop_result(msg_id) - 
if (msg and 'result' in msg - and 'result' in msg['result']): - result = msg['result']['result']['value'] - if result == 'login-form-not-found': + if msg and "result" in msg and "result" in msg["result"]: + result = msg["result"]["result"]["value"] + if result == "login-form-not-found": # we're done return - elif result in ('submitted-form', 'maybe-submitted-form'): + elif result in ("submitted-form", "maybe-submitted-form"): # wait for page load event below self.logger.info( - 'submitted a login form, waiting for another ' - 'page load event') + "submitted a login form, waiting for another " + "page load event" + ) break # else try again to get __brzl_tryLoginState @@ -780,23 +863,23 @@ def try_login(self, username, password, timeout=300): if time.time() - start > 30: raise BrowsingException( - 'timed out trying to check if tryLogin finished') + "timed out trying to check if tryLogin finished" + ) # if we get here, we submitted a form, now we wait for another page # load event - self._wait_for( - lambda: self.websock_thread.got_page_load_event, - timeout=timeout) + self._wait_for(lambda: self.websock_thread.got_page_load_event, timeout=timeout) + class Counter: def __init__(self): self.next_value = 0 + def __next__(self): try: return self.next_value finally: self.next_value += 1 + def peek(self): return self.next_value - - diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 007e8a26..1b52bbf6 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -1,4 +1,4 @@ -''' +""" brozzler/chrome.py - manages the chrome/chromium browser for brozzler Copyright (C) 2014-2023 Internet Archive @@ -14,7 +14,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -''' +""" import logging import urllib.request @@ -31,39 +31,43 @@ import tempfile import sys + def check_version(chrome_exe): - ''' + """ Raises SystemExit if `chrome_exe` is not a supported browser version. Must run in the main thread to have the desired effect. - ''' + """ # mac$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --version - # Google Chrome 64.0.3282.140 + # Google Chrome 64.0.3282.140 # mac$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary --version # Google Chrome 66.0.3341.0 canary # linux$ chromium-browser --version # Using PPAPI flash. # --ppapi-flash-path=/usr/lib/adobe-flashplugin/libpepflashplayer.so --ppapi-flash-version= # Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04 - cmd = [chrome_exe, '--version'] + cmd = [chrome_exe, "--version"] out = subprocess.check_output(cmd, timeout=60) - m = re.search(br'(Chromium|Google Chrome) ([\d.]+)', out) + m = re.search(rb"(Chromium|Google Chrome) ([\d.]+)", out) if not m: sys.exit( - 'unable to parse browser version from output of ' - '%r: %r' % (subprocess.list2cmdline(cmd), out)) + "unable to parse browser version from output of " + "%r: %r" % (subprocess.list2cmdline(cmd), out) + ) version_str = m.group(2).decode() - major_version = int(version_str.split('.')[0]) + major_version = int(version_str.split(".")[0]) if major_version < 64: - sys.exit('brozzler requires chrome/chromium version 64 or ' - 'later but %s reports version %s' % ( - chrome_exe, version_str)) + sys.exit( + "brozzler requires chrome/chromium version 64 or " + "later but %s reports version %s" % (chrome_exe, version_str) + ) + class Chrome: - logger = logging.getLogger(__module__ + '.' 
+ __qualname__) + logger = logging.getLogger(__module__ + "." + __qualname__) def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False): - ''' + """ Initializes instance of this class. Doesn't start the browser, start() does that. @@ -73,7 +77,7 @@ def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False): port: chrome debugging protocol port (default 9222) ignore_cert_errors: configure chrome to accept all certs (default False) - ''' + """ self.port = port self.chrome_exe = chrome_exe self.ignore_cert_errors = ignore_cert_errors @@ -81,63 +85,72 @@ def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False): self.chrome_process = None def __enter__(self): - ''' + """ Returns websocket url to chrome window with about:blank loaded. - ''' + """ return self.start() def __exit__(self, *args): self.stop() def _init_cookie_db(self, cookie_db): - cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default') - cookie_location = os.path.join(cookie_dir, 'Cookies') - self.logger.debug('cookie DB provided, writing to %s', cookie_location) + cookie_dir = os.path.join(self._chrome_user_data_dir, "Default") + cookie_location = os.path.join(cookie_dir, "Cookies") + self.logger.debug("cookie DB provided, writing to %s", cookie_location) os.makedirs(cookie_dir, exist_ok=True) try: - with open(cookie_location, 'wb') as cookie_file: + with open(cookie_location, "wb") as cookie_file: cookie_file.write(cookie_db) except OSError: self.logger.error( - 'exception writing cookie file at %s', - cookie_location, exc_info=True) + "exception writing cookie file at %s", cookie_location, exc_info=True + ) def persist_and_read_cookie_db(self): - cookie_location = os.path.join( - self._chrome_user_data_dir, 'Default', 'Cookies') + cookie_location = os.path.join(self._chrome_user_data_dir, "Default", "Cookies") self.logger.debug( - 'marking cookies persistent then reading file into memory: %s', - cookie_location) + "marking cookies persistent then reading file into memory: %s", + cookie_location, + ) try: with sqlite3.connect(cookie_location) as conn: cur = conn.cursor() - cur.execute('UPDATE cookies SET is_persistent = 1') + cur.execute("UPDATE cookies SET is_persistent = 1") except sqlite3.Error: try: # db schema changed around version 66, this is the old schema with sqlite3.connect(cookie_location) as conn: cur = conn.cursor() - cur.execute('UPDATE cookies SET persistent = 1') + cur.execute("UPDATE cookies SET persistent = 1") except sqlite3.Error: self.logger.error( - 'exception updating cookie DB %s', cookie_location, - exc_info=True) + "exception updating cookie DB %s", cookie_location, exc_info=True + ) cookie_db = None try: - with open(cookie_location, 'rb') as cookie_file: + with open(cookie_location, "rb") as cookie_file: cookie_db = cookie_file.read() except OSError: self.logger.error( - 'exception reading from cookie DB file %s', - cookie_location, exc_info=True) + "exception reading from cookie DB file %s", + cookie_location, + exc_info=True, + ) return cookie_db - def start(self, proxy=None, cookie_db=None, disk_cache_dir=None, - disk_cache_size=None, websocket_timeout=60, - window_height=900, window_width=1400): - ''' + def start( + self, + proxy=None, + cookie_db=None, + disk_cache_dir=None, + disk_cache_size=None, + websocket_timeout=60, + window_height=900, + window_width=1400, + ): + """ Starts chrome/chromium process. 
Args: @@ -154,103 +167,126 @@ def start(self, proxy=None, cookie_db=None, disk_cache_dir=None, window_height, window_width: window height and width, in pixels Returns: websocket url to chrome window with about:blank loaded - ''' + """ # these can raise exceptions self._home_tmpdir = tempfile.TemporaryDirectory() self._chrome_user_data_dir = os.path.join( - self._home_tmpdir.name, 'chrome-user-data') + self._home_tmpdir.name, "chrome-user-data" + ) if cookie_db: self._init_cookie_db(cookie_db) self._shutdown.clear() new_env = os.environ.copy() - new_env['HOME'] = self._home_tmpdir.name + new_env["HOME"] = self._home_tmpdir.name chrome_args = [ - self.chrome_exe, - '-v', - '--headless', - '--remote-debugging-port=%s' % self.port, - '--use-mock-keychain', # mac thing - '--user-data-dir=%s' % self._chrome_user_data_dir, - '--disable-background-networking', '--disable-breakpad', - '--disable-renderer-backgrounding', '--disable-hang-monitor', - '--disable-background-timer-throttling', '--mute-audio', - '--disable-web-sockets', - f'--window-size={window_width},{window_height}', - '--no-default-browser-check', - '--disable-first-run-ui', '--no-first-run', - '--homepage=about:blank', '--disable-direct-npapi-requests', - '--disable-web-security', '--disable-notifications', - '--disable-extensions', '--disable-save-password-bubble', - '--disable-sync'] - - extra_chrome_args = os.environ.get('BROZZLER_EXTRA_CHROME_ARGS') + self.chrome_exe, + "-v", + "--headless", + "--remote-debugging-port=%s" % self.port, + "--use-mock-keychain", # mac thing + "--user-data-dir=%s" % self._chrome_user_data_dir, + "--disable-background-networking", + "--disable-breakpad", + "--disable-renderer-backgrounding", + "--disable-hang-monitor", + "--disable-background-timer-throttling", + "--mute-audio", + "--disable-web-sockets", + f"--window-size={window_width},{window_height}", + "--no-default-browser-check", + "--disable-first-run-ui", + "--no-first-run", + "--homepage=about:blank", + "--disable-direct-npapi-requests", + "--disable-web-security", + "--disable-notifications", + "--disable-extensions", + "--disable-save-password-bubble", + "--disable-sync", + ] + + extra_chrome_args = os.environ.get("BROZZLER_EXTRA_CHROME_ARGS") if extra_chrome_args: chrome_args.extend(extra_chrome_args.split()) if disk_cache_dir: - chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir) + chrome_args.append("--disk-cache-dir=%s" % disk_cache_dir) if disk_cache_size: - chrome_args.append('--disk-cache-size=%s' % disk_cache_size) + chrome_args.append("--disk-cache-size=%s" % disk_cache_size) if self.ignore_cert_errors: - chrome_args.append('--ignore-certificate-errors') + chrome_args.append("--ignore-certificate-errors") if proxy: - chrome_args.append('--proxy-server=%s' % proxy) - chrome_args.append('about:blank') - self.logger.info('running: %r', subprocess.list2cmdline(chrome_args)) + chrome_args.append("--proxy-server=%s" % proxy) + chrome_args.append("about:blank") + self.logger.info("running: %r", subprocess.list2cmdline(chrome_args)) # start_new_session - new process group so we can kill the whole group self.chrome_process = subprocess.Popen( - chrome_args, env=new_env, start_new_session=True, - stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0) + chrome_args, + env=new_env, + start_new_session=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + bufsize=0, + ) self._out_reader_thread = threading.Thread( - target=self._read_stderr_stdout, - name='ChromeOutReaderThread:%s' % self.port, daemon=True) + 
target=self._read_stderr_stdout, + name="ChromeOutReaderThread:%s" % self.port, + daemon=True, + ) self._out_reader_thread.start() - self.logger.info('chrome running, pid %s' % self.chrome_process.pid) + self.logger.info("chrome running, pid %s" % self.chrome_process.pid) return self._websocket_url(timeout_sec=websocket_timeout) - def _websocket_url(self, timeout_sec = 60): - json_url = 'http://localhost:%s/json' % self.port + def _websocket_url(self, timeout_sec=60): + json_url = "http://localhost:%s/json" % self.port # make this a member variable so that kill -QUIT reports it self._start = time.time() self._last_warning = self._start while True: try: raw_json = urllib.request.urlopen(json_url, timeout=30).read() - all_debug_info = json.loads(raw_json.decode('utf-8')) - debug_info = [x for x in all_debug_info - if x['url'] == 'about:blank'] + all_debug_info = json.loads(raw_json.decode("utf-8")) + debug_info = [x for x in all_debug_info if x["url"] == "about:blank"] - if debug_info and 'webSocketDebuggerUrl' in debug_info[0]: - self.logger.debug('%s returned %s', json_url, raw_json) - url = debug_info[0]['webSocketDebuggerUrl'] + if debug_info and "webSocketDebuggerUrl" in debug_info[0]: + self.logger.debug("%s returned %s", json_url, raw_json) + url = debug_info[0]["webSocketDebuggerUrl"] self.logger.info( - 'got chrome window websocket debug url %s from %s', - url, json_url) + "got chrome window websocket debug url %s from %s", + url, + json_url, + ) return url except brozzler.ShutdownRequested: raise except Exception as e: if time.time() - self._last_warning > 30: self.logger.warning( - 'problem with %s (will keep trying until timeout ' - 'of %d seconds): %s', json_url, timeout_sec, e) + "problem with %s (will keep trying until timeout " + "of %d seconds): %s", + json_url, + timeout_sec, + e, + ) self._last_warning = time.time() finally: e = None if self.chrome_process: if time.time() - self._start > timeout_sec: e = Exception( - 'killing chrome, failed to retrieve %s after ' - '%s seconds' % ( - json_url, time.time() - self._start)) + "killing chrome, failed to retrieve %s after " + "%s seconds" % (json_url, time.time() - self._start) + ) elif self.chrome_process.poll() is not None: e = Exception( - 'chrome process died with status %s' % self.chrome_process.poll()) + "chrome process died with status %s" + % self.chrome_process.poll() + ) else: time.sleep(0.5) else: - e = Exception('??? self.chrome_process is not set ???') + e = Exception("??? 
self.chrome_process is not set ???") if e: self.stop() raise e @@ -258,11 +294,13 @@ def _websocket_url(self, timeout_sec = 60): def _read_stderr_stdout(self): # XXX select doesn't work on windows def readline_nonblock(f): - buf = b'' + buf = b"" try: - while not self._shutdown.is_set() and ( - len(buf) == 0 or buf[-1] != 0xa) and select.select( - [f],[],[],0.5)[0]: + while ( + not self._shutdown.is_set() + and (len(buf) == 0 or buf[-1] != 0xA) + and select.select([f], [], [], 0.5)[0] + ): buf += f.read(1) except (ValueError, OSError): # When the chrome process crashes, stdout & stderr are closed @@ -276,16 +314,16 @@ def readline_nonblock(f): buf = readline_nonblock(self.chrome_process.stdout) if buf: self.logger.trace( - 'chrome pid %s STDOUT %s', - self.chrome_process.pid, buf) + "chrome pid %s STDOUT %s", self.chrome_process.pid, buf + ) buf = readline_nonblock(self.chrome_process.stderr) if buf: self.logger.trace( - 'chrome pid %s STDERR %s', - self.chrome_process.pid, buf) + "chrome pid %s STDERR %s", self.chrome_process.pid, buf + ) except: - self.logger.error('unexpected exception', exc_info=True) + self.logger.error("unexpected exception", exc_info=True) def stop(self): if not self.chrome_process or self._shutdown.is_set(): @@ -294,8 +332,7 @@ def stop(self): timeout_sec = 300 if self.chrome_process.poll() is None: - self.logger.info( - 'terminating chrome pgid %s', self.chrome_process.pid) + self.logger.info("terminating chrome pgid %s", self.chrome_process.pid) os.killpg(self.chrome_process.pid, signal.SIGTERM) t0 = time.time() @@ -306,12 +343,14 @@ def stop(self): if status is not None: if status == 0: self.logger.info( - 'chrome pid %s exited normally', - self.chrome_process.pid) + "chrome pid %s exited normally", self.chrome_process.pid + ) else: self.logger.warning( - 'chrome pid %s exited with nonzero status %s', - self.chrome_process.pid, status) + "chrome pid %s exited with nonzero status %s", + self.chrome_process.pid, + status, + ) # XXX I would like to forcefully kill the process group # here to guarantee no orphaned chromium subprocesses hang @@ -321,14 +360,18 @@ def stop(self): time.sleep(0.5) self.logger.warning( - 'chrome pid %s still alive %.1f seconds after sending ' - 'SIGTERM, sending SIGKILL', self.chrome_process.pid, - time.time() - t0) + "chrome pid %s still alive %.1f seconds after sending " + "SIGTERM, sending SIGKILL", + self.chrome_process.pid, + time.time() - t0, + ) os.killpg(self.chrome_process.pid, signal.SIGKILL) status = self.chrome_process.wait() self.logger.warning( - 'chrome pid %s reaped (status=%s) after killing with ' - 'SIGKILL', self.chrome_process.pid, status) + "chrome pid %s reaped (status=%s) after killing with " "SIGKILL", + self.chrome_process.pid, + status, + ) finally: self.chrome_process.stdout.close() @@ -337,8 +380,7 @@ def stop(self): self._home_tmpdir.cleanup() except: self.logger.error( - 'exception deleting %s', self._home_tmpdir, - exc_info=True) + "exception deleting %s", self._home_tmpdir, exc_info=True + ) self._out_reader_thread.join() self.chrome_process = None - diff --git a/brozzler/cli.py b/brozzler/cli.py index 534dbe17..75230086 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -''' +""" brozzler/cli.py - brozzler command line executables Copyright (C) 2014-2023 Internet Archive @@ -15,7 +15,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-''' +""" import argparse import brozzler @@ -41,64 +41,97 @@ r = rdb.RethinkDB() + def add_common_options(arg_parser, argv=None): argv = argv or sys.argv arg_parser.add_argument( - '-q', '--quiet', dest='log_level', action='store_const', - default=logging.INFO, const=logging.NOTICE, help='quiet logging') + "-q", + "--quiet", + dest="log_level", + action="store_const", + default=logging.INFO, + const=logging.NOTICE, + help="quiet logging", + ) arg_parser.add_argument( - '-v', '--verbose', dest='log_level', action='store_const', - default=logging.INFO, const=logging.DEBUG, help=( - 'verbose logging')) + "-v", + "--verbose", + dest="log_level", + action="store_const", + default=logging.INFO, + const=logging.DEBUG, + help=("verbose logging"), + ) arg_parser.add_argument( - '--trace', dest='log_level', action='store_const', - default=logging.INFO, const=logging.TRACE, help=( - 'very verbose logging')) + "--trace", + dest="log_level", + action="store_const", + default=logging.INFO, + const=logging.TRACE, + help=("very verbose logging"), + ) # arg_parser.add_argument( # '-s', '--silent', dest='log_level', action='store_const', # default=logging.INFO, const=logging.CRITICAL) arg_parser.add_argument( - '--version', action='version', - version='brozzler %s - %s' % ( - brozzler.__version__, os.path.basename(argv[0]))) + "--version", + action="version", + version="brozzler %s - %s" % (brozzler.__version__, os.path.basename(argv[0])), + ) + def add_rethinkdb_options(arg_parser): arg_parser.add_argument( - '--rethinkdb-servers', dest='rethinkdb_servers', - default=os.environ.get('BROZZLER_RETHINKDB_SERVERS', 'localhost'), - help=( - 'rethinkdb servers, e.g. ' - 'db0.foo.org,db0.foo.org:38015,db1.foo.org (default is the ' - 'value of environment variable BROZZLER_RETHINKDB_SERVERS)')) + "--rethinkdb-servers", + dest="rethinkdb_servers", + default=os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost"), + help=( + "rethinkdb servers, e.g. 
" + "db0.foo.org,db0.foo.org:38015,db1.foo.org (default is the " + "value of environment variable BROZZLER_RETHINKDB_SERVERS)" + ), + ) arg_parser.add_argument( - '--rethinkdb-db', dest='rethinkdb_db', - default=os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'), - help=( - 'rethinkdb database name (default is the value of environment ' - 'variable BROZZLER_RETHINKDB_DB)')) + "--rethinkdb-db", + dest="rethinkdb_db", + default=os.environ.get("BROZZLER_RETHINKDB_DB", "brozzler"), + help=( + "rethinkdb database name (default is the value of environment " + "variable BROZZLER_RETHINKDB_DB)" + ), + ) + def rethinker(args): - servers = args.rethinkdb_servers or 'localhost' - db = args.rethinkdb_db or os.environ.get( - 'BROZZLER_RETHINKDB_DB') or 'brozzler' - return doublethink.Rethinker(servers.split(','), db) + servers = args.rethinkdb_servers or "localhost" + db = args.rethinkdb_db or os.environ.get("BROZZLER_RETHINKDB_DB") or "brozzler" + return doublethink.Rethinker(servers.split(","), db) + def configure_logging(args): logging.basicConfig( - stream=sys.stderr, level=args.log_level, format=( - '%(asctime)s %(process)d %(levelname)s %(threadName)s ' - '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')) - logging.getLogger('requests.packages.urllib3').setLevel(logging.WARN) + stream=sys.stderr, + level=args.log_level, + format=( + "%(asctime)s %(process)d %(levelname)s %(threadName)s " + "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s" + ), + ) + logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) warnings.simplefilter( - 'ignore', category=requests.packages.urllib3.exceptions.InsecureRequestWarning) + "ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning + ) warnings.simplefilter( - 'ignore', category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) + "ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning + ) + def suggest_default_chrome_exe(): # mac os x application executable paths for path in [ - '/Applications/Chromium.app/Contents/MacOS/Chromium', - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome']: + "/Applications/Chromium.app/Contents/MacOS/Chromium", + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + ]: if os.path.exists(path): return path @@ -107,79 +140,113 @@ def suggest_default_chrome_exe(): # google chrome executable names taken from these packages: # http://www.ubuntuupdates.org/ppa/google_chrome for exe in [ - 'chromium-browser', 'chromium', 'google-chrome', - 'google-chrome-stable', 'google-chrome-beta', - 'google-chrome-unstable']: + "chromium-browser", + "chromium", + "google-chrome", + "google-chrome-stable", + "google-chrome-beta", + "google-chrome-unstable", + ]: if shutil.which(exe): return exe - return 'chromium-browser' + return "chromium-browser" + -class BetterArgumentDefaultsHelpFormatter( - argparse.ArgumentDefaultsHelpFormatter): - ''' +class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter): + """ Like argparse.ArgumentDefaultsHelpFormatter but omits the default value for arguments with action='store_const'. - ''' + """ + def _get_help_string(self, action): if isinstance(action, argparse._StoreConstAction): return action.help else: return super()._get_help_string(action) + def brozzle_page(argv=None): - ''' + """ Command line utility entry point for brozzling a single page. Opens url in a browser, running some javascript behaviors, and prints outlinks. 
- ''' + """ argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - description='brozzle-page - brozzle a single page', - formatter_class=BetterArgumentDefaultsHelpFormatter) - arg_parser.add_argument('url', metavar='URL', help='page url') + prog=os.path.basename(argv[0]), + description="brozzle-page - brozzle a single page", + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) + arg_parser.add_argument("url", metavar="URL", help="page url") arg_parser.add_argument( - '-e', '--chrome-exe', dest='chrome_exe', - default=suggest_default_chrome_exe(), - help='executable to use to invoke chrome') + "-e", + "--chrome-exe", + dest="chrome_exe", + default=suggest_default_chrome_exe(), + help="executable to use to invoke chrome", + ) arg_parser.add_argument( - '--behavior-parameters', dest='behavior_parameters', - default=None, help=( - 'json blob of parameters to populate the javascript behavior ' - 'template, e.g. {"parameter_username":"x",' - '"parameter_password":"y"}')) + "--behavior-parameters", + dest="behavior_parameters", + default=None, + help=( + "json blob of parameters to populate the javascript behavior " + 'template, e.g. {"parameter_username":"x",' + '"parameter_password":"y"}' + ), + ) arg_parser.add_argument( - '--username', dest='username', default=None, - help='use this username to try to log in if a login form is found') + "--username", + dest="username", + default=None, + help="use this username to try to log in if a login form is found", + ) arg_parser.add_argument( - '--password', dest='password', default=None, - help='use this password to try to log in if a login form is found') + "--password", + dest="password", + default=None, + help="use this password to try to log in if a login form is found", + ) + arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy") arg_parser.add_argument( - '--proxy', dest='proxy', default=None, help='http proxy') + "--browser_throughput", + type=int, + dest="download_throughput", + default=-1, + help="Chrome DevTools downloadThroughput for Network.emulateNetworkConditions", + ) arg_parser.add_argument( - '--browser_throughput', type=int, dest='download_throughput', default=-1, - help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions') + "--browser_window_height", + type=int, + dest="window_height", + default=900, + help="browser window height in pixels", + ) arg_parser.add_argument( - '--browser_window_height', type=int, dest='window_height', default=900, - help='browser window height in pixels') + "--browser_window_width", + type=int, + dest="window_width", + default=1400, + help="browser window width in pixels", + ) arg_parser.add_argument( - '--browser_window_width', type=int, dest='window_width', default=1400, - help='browser window width in pixels') + "--stealth", + dest="stealth", + action="store_true", + help="Try to avoid web bot detection", + ) arg_parser.add_argument( - '--stealth', dest='stealth', action='store_true', - help='Try to avoid web bot detection') + "--screenshot-full-page", dest="screenshot_full_page", action="store_true" + ) arg_parser.add_argument( - '--screenshot-full-page', dest='screenshot_full_page', - action='store_true') + "--skip-extract-outlinks", dest="skip_extract_outlinks", action="store_true" + ) arg_parser.add_argument( - '--skip-extract-outlinks', dest='skip_extract_outlinks', - action='store_true') + "--skip-visit-hashtags", dest="skip_visit_hashtags", action="store_true" + ) arg_parser.add_argument( - 
'--skip-visit-hashtags', dest='skip_visit_hashtags', - action='store_true') - arg_parser.add_argument( - '--skip-youtube-dl', dest='skip_youtube_dl', action='store_true') - arg_parser.add_argument( - '--simpler404', dest='simpler404', action='store_true') + "--skip-youtube-dl", dest="skip_youtube_dl", action="store_true" + ) + arg_parser.add_argument("--simpler404", dest="simpler404", action="store_true") add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -189,59 +256,81 @@ def brozzle_page(argv=None): behavior_parameters = {} if args.behavior_parameters: behavior_parameters = json.loads(args.behavior_parameters) - site = brozzler.Site(None, { - 'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters, - 'username': args.username, 'password': args.password}) - page = brozzler.Page(None, {'url': args.url, 'site_id': site.id}) + site = brozzler.Site( + None, + { + "id": -1, + "seed": args.url, + "behavior_parameters": behavior_parameters, + "username": args.username, + "password": args.password, + }, + ) + page = brozzler.Page(None, {"url": args.url, "site_id": site.id}) worker = brozzler.BrozzlerWorker( - frontier=None, proxy=args.proxy, - skip_extract_outlinks=args.skip_extract_outlinks, - skip_visit_hashtags=args.skip_visit_hashtags, - skip_youtube_dl=args.skip_youtube_dl, - simpler404=args.simpler404, - screenshot_full_page=args.screenshot_full_page, - download_throughput=args.download_throughput, - window_height=args.window_height, - window_width=args.window_width, - stealth=args.stealth) + frontier=None, + proxy=args.proxy, + skip_extract_outlinks=args.skip_extract_outlinks, + skip_visit_hashtags=args.skip_visit_hashtags, + skip_youtube_dl=args.skip_youtube_dl, + simpler404=args.simpler404, + screenshot_full_page=args.screenshot_full_page, + download_throughput=args.download_throughput, + window_height=args.window_height, + window_width=args.window_width, + stealth=args.stealth, + ) def on_screenshot(screenshot_jpeg): OK_CHARS = string.ascii_letters + string.digits - filename = '/tmp/{}-{:%Y%m%d%H%M%S}.jpg'.format( - ''.join(ch if ch in OK_CHARS else '_' for ch in args.url), - datetime.datetime.now()) - with open(filename, 'wb') as f: + filename = "/tmp/{}-{:%Y%m%d%H%M%S}.jpg".format( + "".join(ch if ch in OK_CHARS else "_" for ch in args.url), + datetime.datetime.now(), + ) + with open(filename, "wb") as f: f.write(screenshot_jpeg) - logging.info('wrote screenshot to %s', filename) + logging.info("wrote screenshot to %s", filename) browser = brozzler.Browser(chrome_exe=args.chrome_exe) try: - browser.start(proxy=args.proxy, window_height=args.window_height, window_width=args.window_width) + browser.start( + proxy=args.proxy, + window_height=args.window_height, + window_width=args.window_width, + ) outlinks = worker.brozzle_page( - browser, site, page, on_screenshot=on_screenshot, - enable_youtube_dl=not args.skip_youtube_dl) - logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks))) + browser, + site, + page, + on_screenshot=on_screenshot, + enable_youtube_dl=not args.skip_youtube_dl, + ) + logging.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks))) except brozzler.ReachedLimit as e: - logging.error('reached limit %s', e) + logging.error("reached limit %s", e) except brozzler.PageInterstitialShown as e: - logging.error('page interstitial shown %s', e) + logging.error("page interstitial shown %s", e) finally: browser.stop() + def brozzler_new_job(argv=None): - ''' + """ Command line utility entry point for queuing a new brozzler 
job. Takes a yaml brozzler job configuration file, creates job, sites, and pages objects in rethinkdb, which brozzler-workers will look at and start crawling. - ''' + """ argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - description='brozzler-new-job - queue new job with brozzler', - formatter_class=BetterArgumentDefaultsHelpFormatter) + prog=os.path.basename(argv[0]), + description="brozzler-new-job - queue new job with brozzler", + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) arg_parser.add_argument( - 'job_conf_file', metavar='JOB_CONF_FILE', - help='brozzler job configuration file in yaml') + "job_conf_file", + metavar="JOB_CONF_FILE", + help="brozzler job configuration file in yaml", + ) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) @@ -253,112 +342,181 @@ def brozzler_new_job(argv=None): try: brozzler.new_job_file(frontier, args.job_conf_file) except brozzler.InvalidJobConf as e: - print('brozzler-new-job: invalid job file:', args.job_conf_file, file=sys.stderr) - print(' ' + yaml.dump(e.errors).rstrip().replace('\n', '\n '), file=sys.stderr) + print( + "brozzler-new-job: invalid job file:", args.job_conf_file, file=sys.stderr + ) + print( + " " + yaml.dump(e.errors).rstrip().replace("\n", "\n "), file=sys.stderr + ) sys.exit(1) + def brozzler_new_site(argv=None): - ''' + """ Command line utility entry point for queuing a new brozzler site. Takes a seed url and creates a site and page object in rethinkdb, which brozzler-workers will look at and start crawling. - ''' + """ argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - description='brozzler-new-site - register site to brozzle', - formatter_class=BetterArgumentDefaultsHelpFormatter) - arg_parser.add_argument('seed', metavar='SEED', help='seed url') + prog=os.path.basename(argv[0]), + description="brozzler-new-site - register site to brozzle", + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) + arg_parser.add_argument("seed", metavar="SEED", help="seed url") add_rethinkdb_options(arg_parser) arg_parser.add_argument( - '--time-limit', dest='time_limit', default=None, - help='time limit in seconds for this site') + "--time-limit", + dest="time_limit", + default=None, + help="time limit in seconds for this site", + ) arg_parser.add_argument( - '--ignore-robots', dest='ignore_robots', action='store_true', - help='ignore robots.txt for this site') + "--ignore-robots", + dest="ignore_robots", + action="store_true", + help="ignore robots.txt for this site", + ) arg_parser.add_argument( - '--warcprox-meta', dest='warcprox_meta', - help=( - 'Warcprox-Meta http request header to send with each request; ' - 'must be a json blob, ignored unless warcprox features are ' - 'enabled')) + "--warcprox-meta", + dest="warcprox_meta", + help=( + "Warcprox-Meta http request header to send with each request; " + "must be a json blob, ignored unless warcprox features are " + "enabled" + ), + ) arg_parser.add_argument( - '--behavior-parameters', dest='behavior_parameters', - default=None, help=( - 'json blob of parameters to populate the javascript behavior ' - 'template, e.g. {"parameter_username":"x",' - '"parameter_password":"y"}')) + "--behavior-parameters", + dest="behavior_parameters", + default=None, + help=( + "json blob of parameters to populate the javascript behavior " + 'template, e.g. 
{"parameter_username":"x",' + '"parameter_password":"y"}' + ), + ) arg_parser.add_argument( - '--username', dest='username', default=None, - help='use this username to try to log in if a login form is found') + "--username", + dest="username", + default=None, + help="use this username to try to log in if a login form is found", + ) arg_parser.add_argument( - '--password', dest='password', default=None, - help='use this password to try to log in if a login form is found') + "--password", + dest="password", + default=None, + help="use this password to try to log in if a login form is found", + ) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) configure_logging(args) rr = rethinker(args) - site = brozzler.Site(rr, { - 'seed': args.seed, - 'time_limit': int(args.time_limit) if args.time_limit else None, - 'ignore_robots': args.ignore_robots, - 'warcprox_meta': json.loads( - args.warcprox_meta) if args.warcprox_meta else None, - 'behavior_parameters': json.loads( - args.behavior_parameters) if args.behavior_parameters else None, - 'username': args.username, - 'password': args.password}) + site = brozzler.Site( + rr, + { + "seed": args.seed, + "time_limit": int(args.time_limit) if args.time_limit else None, + "ignore_robots": args.ignore_robots, + "warcprox_meta": ( + json.loads(args.warcprox_meta) if args.warcprox_meta else None + ), + "behavior_parameters": ( + json.loads(args.behavior_parameters) + if args.behavior_parameters + else None + ), + "username": args.username, + "password": args.password, + }, + ) frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) + def brozzler_worker(argv=None): - ''' + """ Main entry point for brozzler, gets sites and pages to brozzle from rethinkdb, brozzles them. 
- ''' + """ argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - formatter_class=BetterArgumentDefaultsHelpFormatter) + prog=os.path.basename(argv[0]), + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) add_rethinkdb_options(arg_parser) arg_parser.add_argument( - '-e', '--chrome-exe', dest='chrome_exe', - default=suggest_default_chrome_exe(), - help='executable to use to invoke chrome') + "-e", + "--chrome-exe", + dest="chrome_exe", + default=suggest_default_chrome_exe(), + help="executable to use to invoke chrome", + ) arg_parser.add_argument( - '-n', '--max-browsers', dest='max_browsers', default='1', - help='max number of chrome instances simultaneously browsing pages') + "-n", + "--max-browsers", + dest="max_browsers", + default="1", + help="max number of chrome instances simultaneously browsing pages", + ) + arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy") arg_parser.add_argument( - '--proxy', dest='proxy', default=None, help='http proxy') + "--browser_throughput", + type=int, + dest="download_throughput", + default=-1, + help="Chrome DevTools downloadThroughput for Network.emulateNetworkConditions", + ) arg_parser.add_argument( - '--browser_throughput', type=int, dest='download_throughput', default=-1, - help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions') + "--browser_window_height", + type=int, + dest="window_height", + default=900, + help="browser window height in pixels", + ) arg_parser.add_argument( - '--browser_window_height', type=int, dest='window_height', default=900, - help='browser window height in pixels') + "--browser_window_width", + type=int, + dest="window_width", + default=1400, + help="browser window width in pixels", + ) arg_parser.add_argument( - '--browser_window_width', type=int, dest='window_width', default=1400, - help='browser window width in pixels') + "--warcprox-auto", + dest="warcprox_auto", + action="store_true", + help=( + "when needed, choose an available instance of warcprox from " + "the rethinkdb service registry" + ), + ) arg_parser.add_argument( - '--warcprox-auto', dest='warcprox_auto', action='store_true', - help=( - 'when needed, choose an available instance of warcprox from ' - 'the rethinkdb service registry')) + "--skip-extract-outlinks", + dest="skip_extract_outlinks", + action="store_true", + help=argparse.SUPPRESS, + ) arg_parser.add_argument( - '--skip-extract-outlinks', dest='skip_extract_outlinks', - action='store_true', help=argparse.SUPPRESS) + "--skip-visit-hashtags", + dest="skip_visit_hashtags", + action="store_true", + help=argparse.SUPPRESS, + ) arg_parser.add_argument( - '--skip-visit-hashtags', dest='skip_visit_hashtags', - action='store_true', help=argparse.SUPPRESS) + "--skip-youtube-dl", + dest="skip_youtube_dl", + action="store_true", + help=argparse.SUPPRESS, + ) arg_parser.add_argument( - '--skip-youtube-dl', dest='skip_youtube_dl', - action='store_true', help=argparse.SUPPRESS) - arg_parser.add_argument( - '--stealth', dest='stealth', action='store_true', - help='Try to avoid web bot detection') + "--stealth", + dest="stealth", + action="store_true", + help="Try to avoid web bot detection", + ) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -375,14 +533,14 @@ def dump_state(signum, frame): if threads[ident]: state_strs.append(str(threads[ident])) else: - state_strs.append('' % ident) + state_strs.append("" % ident) stack = traceback.format_stack(frames[ident]) - 
state_strs.append(''.join(stack)) + state_strs.append("".join(stack)) logging.info( - 'dumping state (caught signal %s)\n%s' % ( - signum, '\n'.join(state_strs))) + "dumping state (caught signal %s)\n%s" % (signum, "\n".join(state_strs)) + ) except BaseException as e: - logging.error('exception dumping state: %s' % e) + logging.error("exception dumping state: %s" % e) finally: signal.signal(signal.SIGQUIT, dump_state) @@ -390,35 +548,41 @@ def dump_state(signum, frame): frontier = brozzler.RethinkDbFrontier(rr) service_registry = doublethink.ServiceRegistry(rr) worker = brozzler.worker.BrozzlerWorker( - frontier, service_registry, max_browsers=int(args.max_browsers), - chrome_exe=args.chrome_exe, proxy=args.proxy, - warcprox_auto=args.warcprox_auto, - skip_extract_outlinks=args.skip_extract_outlinks, - skip_visit_hashtags=args.skip_visit_hashtags, - skip_youtube_dl=args.skip_youtube_dl, - stealth=args.stealth) + frontier, + service_registry, + max_browsers=int(args.max_browsers), + chrome_exe=args.chrome_exe, + proxy=args.proxy, + warcprox_auto=args.warcprox_auto, + skip_extract_outlinks=args.skip_extract_outlinks, + skip_visit_hashtags=args.skip_visit_hashtags, + skip_youtube_dl=args.skip_youtube_dl, + stealth=args.stealth, + ) signal.signal(signal.SIGQUIT, dump_state) - signal.signal(signal.SIGTERM, lambda s,f: worker.stop()) - signal.signal(signal.SIGINT, lambda s,f: worker.stop()) + signal.signal(signal.SIGTERM, lambda s, f: worker.stop()) + signal.signal(signal.SIGINT, lambda s, f: worker.stop()) - th = threading.Thread(target=worker.run, name='BrozzlerWorkerThread') + th = threading.Thread(target=worker.run, name="BrozzlerWorkerThread") th.start() th.join() - logging.info('brozzler-worker is all done, exiting') + logging.info("brozzler-worker is all done, exiting") + def brozzler_ensure_tables(argv=None): - ''' + """ Creates rethinkdb tables if they don't already exist. Brozzler (brozzler-worker, brozzler-new-job, etc) normally creates the tables it needs on demand at startup, but if multiple instances are starting up at the same time, you can end up with duplicate broken tables. So it's a good idea to use this utility at an early step when spinning up a cluster. 
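Table creation is a side effect of constructing the frontier, so the same guarantee is available from a deployment script. A minimal sketch, assuming a reachable RethinkDB on localhost; run it once before starting several workers in parallel:

    import doublethink
    import brozzler.frontier

    rr = doublethink.Rethinker(["localhost"], "brozzler")
    # constructing the frontier creates the sites, pages, and jobs tables
    # if they do not already exist
    brozzler.frontier.RethinkDbFrontier(rr)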
- ''' + """ argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - formatter_class=BetterArgumentDefaultsHelpFormatter) + prog=os.path.basename(argv[0]), + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) @@ -433,33 +597,37 @@ def brozzler_ensure_tables(argv=None): # sites, pages, jobs tables brozzler.frontier.RethinkDbFrontier(rr) + class Jsonner(json.JSONEncoder): def default(self, o): if isinstance(o, datetime.datetime): return o.isoformat() elif isinstance(o, bytes): - return base64.b64encode(o).decode('ascii') + return base64.b64encode(o).decode("ascii") else: return json.JSONEncoder.default(self, o) + def brozzler_list_jobs(argv=None): argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - formatter_class=BetterArgumentDefaultsHelpFormatter) + prog=os.path.basename(argv[0]), + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) arg_parser.add_argument( - '--yaml', dest='yaml', action='store_true', help=( - 'yaml output (default is json)')) + "--yaml", + dest="yaml", + action="store_true", + help=("yaml output (default is json)"), + ) group = arg_parser.add_mutually_exclusive_group(required=True) group.add_argument( - '--active', dest='active', action='store_true', help=( - 'list active jobs')) + "--active", dest="active", action="store_true", help=("list active jobs") + ) + group.add_argument("--all", dest="all", action="store_true", help=("list all jobs")) group.add_argument( - '--all', dest='all', action='store_true', help=( - 'list all jobs')) - group.add_argument( - '--job', dest='job', metavar='JOB_ID', help=( - 'list only the specified job')) + "--job", dest="job", metavar="JOB_ID", help=("list only the specified job") + ) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) @@ -472,52 +640,60 @@ def brozzler_list_jobs(argv=None): job_id = int(args.job) except ValueError: job_id = args.job - reql = rr.table('jobs').get(job_id) - logging.debug('querying rethinkdb: %s', reql) + reql = rr.table("jobs").get(job_id) + logging.debug("querying rethinkdb: %s", reql) result = reql.run() if result: results = [reql.run()] else: - logging.error('no such job with id %r', job_id) + logging.error("no such job with id %r", job_id) sys.exit(1) else: - reql = rr.table('jobs').order_by('id') + reql = rr.table("jobs").order_by("id") if args.active: - reql = reql.filter({'status': 'ACTIVE'}) - logging.debug('querying rethinkdb: %s', reql) + reql = reql.filter({"status": "ACTIVE"}) + logging.debug("querying rethinkdb: %s", reql) results = reql.run() if args.yaml: yaml.dump_all( - results, stream=sys.stdout, explicit_start=True, - default_flow_style=False) + results, stream=sys.stdout, explicit_start=True, default_flow_style=False + ) else: for result in results: print(json.dumps(result, cls=Jsonner, indent=2)) + def brozzler_list_sites(argv=None): argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - formatter_class=BetterArgumentDefaultsHelpFormatter) + prog=os.path.basename(argv[0]), + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) arg_parser.add_argument( - '--yaml', dest='yaml', action='store_true', help=( - 'yaml output (default is json)')) + "--yaml", + dest="yaml", + action="store_true", + help=("yaml output (default is json)"), + ) group = arg_parser.add_mutually_exclusive_group(required=True) group.add_argument( - '--active', dest='active', 
action='store_true', help=( - 'list all active sites')) + "--active", dest="active", action="store_true", help=("list all active sites") + ) group.add_argument( - '--job', dest='job', metavar='JOB_ID', help=( - 'list sites for a particular job')) + "--job", dest="job", metavar="JOB_ID", help=("list sites for a particular job") + ) group.add_argument( - '--jobless', dest='jobless', action='store_true', help=( - 'list all jobless sites')) + "--jobless", + dest="jobless", + action="store_true", + help=("list all jobless sites"), + ) group.add_argument( - '--site', dest='site', metavar='SITE_ID', help=( - 'list only the specified site')) + "--site", dest="site", metavar="SITE_ID", help=("list only the specified site") + ) group.add_argument( - '--all', dest='all', action='store_true', help=( - 'list all sites')) + "--all", dest="all", action="store_true", help=("list all sites") + ) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) @@ -526,60 +702,76 @@ def brozzler_list_sites(argv=None): rr = rethinker(args) - reql = rr.table('sites') + reql = rr.table("sites") if args.job: try: job_id = int(args.job) except ValueError: job_id = args.job - reql = reql.get_all(job_id, index='job_id') + reql = reql.get_all(job_id, index="job_id") elif args.jobless: - reql = reql.filter(~r.row.has_fields('job_id')) + reql = reql.filter(~r.row.has_fields("job_id")) elif args.active: reql = reql.between( - ['ACTIVE', r.minval], ['ACTIVE', r.maxval], - index='sites_last_disclaimed') + ["ACTIVE", r.minval], ["ACTIVE", r.maxval], index="sites_last_disclaimed" + ) elif args.site: reql = reql.get_all(args.site) - logging.debug('querying rethinkdb: %s', reql) + logging.debug("querying rethinkdb: %s", reql) results = reql.run() if args.yaml: yaml.dump_all( - results, stream=sys.stdout, explicit_start=True, - default_flow_style=False) + results, stream=sys.stdout, explicit_start=True, default_flow_style=False + ) else: for result in results: print(json.dumps(result, cls=Jsonner, indent=2)) + def brozzler_list_pages(argv=None): argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - formatter_class=BetterArgumentDefaultsHelpFormatter) + prog=os.path.basename(argv[0]), + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) arg_parser.add_argument( - '--yaml', dest='yaml', action='store_true', help=( - 'yaml output (default is json)')) + "--yaml", + dest="yaml", + action="store_true", + help=("yaml output (default is json)"), + ) group = arg_parser.add_mutually_exclusive_group(required=True) group.add_argument( - '--job', dest='job', metavar='JOB_ID', help=( - 'list pages for all sites of a particular job')) + "--job", + dest="job", + metavar="JOB_ID", + help=("list pages for all sites of a particular job"), + ) group.add_argument( - '--site', dest='site', metavar='SITE_ID', help=( - 'list pages for the specified site')) + "--site", + dest="site", + metavar="SITE_ID", + help=("list pages for the specified site"), + ) # group.add_argument( # '--page', dest='page', metavar='PAGE_ID', help=( # 'list only the specified page')) group = arg_parser.add_mutually_exclusive_group() group.add_argument( - '--queued', dest='queued', action='store_true', help=( - 'limit to queued pages')) + "--queued", dest="queued", action="store_true", help=("limit to queued pages") + ) group.add_argument( - '--brozzled', dest='brozzled', action='store_true', help=( - 'limit to pages that have already been brozzled')) + "--brozzled", + dest="brozzled", + action="store_true", + 
help=("limit to pages that have already been brozzled"), + ) group.add_argument( - '--claimed', dest='claimed', action='store_true', help=( - 'limit to pages that are currently claimed by a brozzler ' - 'worker')) + "--claimed", + dest="claimed", + action="store_true", + help=("limit to pages that are currently claimed by a brozzler " "worker"), + ) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) @@ -592,8 +784,8 @@ def brozzler_list_pages(argv=None): job_id = int(args.job) except ValueError: job_id = args.job - reql = rr.table('sites').get_all(job_id, index='job_id')['id'] - logging.debug('querying rethinkb: %s', reql) + reql = rr.table("sites").get_all(job_id, index="job_id")["id"] + logging.debug("querying rethinkb: %s", reql) site_ids = reql.run() elif args.site: try: @@ -602,54 +794,77 @@ def brozzler_list_pages(argv=None): site_ids = [args.site] for site_id in site_ids: - reql = rr.table('pages') + reql = rr.table("pages") if args.queued: reql = reql.between( - [site_id, 0, r.minval], [site_id, 0, r.maxval], - index='least_hops') + [site_id, 0, r.minval], [site_id, 0, r.maxval], index="least_hops" + ) elif args.brozzled: reql = reql.between( - [site_id, 1, r.minval], [site_id, r.maxval, r.maxval], - index='least_hops') + [site_id, 1, r.minval], + [site_id, r.maxval, r.maxval], + index="least_hops", + ) else: reql = reql.between( - [site_id, 0, r.minval], [site_id, r.maxval, r.maxval], - index='least_hops') + [site_id, 0, r.minval], + [site_id, r.maxval, r.maxval], + index="least_hops", + ) reql = reql.order_by(index="least_hops") if args.claimed: - reql = reql.filter({'claimed': True}) - logging.debug('querying rethinkb: %s', reql) + reql = reql.filter({"claimed": True}) + logging.debug("querying rethinkb: %s", reql) results = reql.run() if args.yaml: yaml.dump_all( - results, stream=sys.stdout, explicit_start=True, - default_flow_style=False) + results, + stream=sys.stdout, + explicit_start=True, + default_flow_style=False, + ) else: for result in results: print(json.dumps(result, cls=Jsonner, indent=2)) + def brozzler_purge(argv=None): argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - description='brozzler-purge - purge crawl state from rethinkdb', - formatter_class=BetterArgumentDefaultsHelpFormatter) + prog=os.path.basename(argv[0]), + description="brozzler-purge - purge crawl state from rethinkdb", + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) group = arg_parser.add_mutually_exclusive_group(required=True) group.add_argument( - '--job', dest='job', metavar='JOB_ID', help=( - 'purge crawl state from rethinkdb for a job, including all ' - 'sites and pages')) + "--job", + dest="job", + metavar="JOB_ID", + help=( + "purge crawl state from rethinkdb for a job, including all " + "sites and pages" + ), + ) group.add_argument( - '--site', dest='site', metavar='SITE_ID', help=( - 'purge crawl state from rethinkdb for a site, including all ' - 'pages')) + "--site", + dest="site", + metavar="SITE_ID", + help=("purge crawl state from rethinkdb for a site, including all " "pages"), + ) group.add_argument( - '--finished-before', dest='finished_before', metavar='YYYY-MM-DD', - help=('purge crawl state from rethinkdb for a jobs that ended ' - 'before this date')) + "--finished-before", + dest="finished_before", + metavar="YYYY-MM-DD", + help=( + "purge crawl state from rethinkdb for a jobs that ended " "before this date" + ), + ) arg_parser.add_argument( - '--force', dest='force', action='store_true', help=( - 
'purge even if job or site is still has status ACTIVE')) + "--force", + dest="force", + action="store_true", + help=("purge even if job or site is still has status ACTIVE"), + ) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) @@ -665,162 +880,210 @@ def brozzler_purge(argv=None): job_id = args.job job = brozzler.Job.load(rr, job_id) if not job: - logging.fatal('no such job %r', job_id) + logging.fatal("no such job %r", job_id) sys.exit(1) - if job.status == 'ACTIVE': + if job.status == "ACTIVE": if args.force: logging.warning( - 'job %s has status ACTIVE, purging anyway because ' - '--force was supplied', job_id) + "job %s has status ACTIVE, purging anyway because " + "--force was supplied", + job_id, + ) else: logging.fatal( - 'refusing to purge job %s because status is ACTIVE ' - '(override with --force)', job_id) + "refusing to purge job %s because status is ACTIVE " + "(override with --force)", + job_id, + ) sys.exit(1) _purge_job(rr, job_id) elif args.site: site_id = args.site site = brozzler.Site.load(rr, site_id) if not site: - logging.fatal('no such job %r', job_id) + logging.fatal("no such job %r", job_id) sys.exit(1) - if site.status == 'ACTIVE': + if site.status == "ACTIVE": if args.force: logging.warning( - 'site %s has status ACTIVE, purging anyway because ' - '--force was supplied', site_id) + "site %s has status ACTIVE, purging anyway because " + "--force was supplied", + site_id, + ) else: logging.fatal( - 'refusing to purge site %s because status is ACTIVE ' - '(override with --force)', site_id) + "refusing to purge site %s because status is ACTIVE " + "(override with --force)", + site_id, + ) sys.exit(1) _purge_site(rr, site_id) elif args.finished_before: finished_before = datetime.datetime.strptime( - args.finished_before, '%Y-%m-%d').replace( - tzinfo=doublethink.UTC) - reql = rr.table('jobs').filter( - r.row['finished'].default(r.maxval).lt(finished_before).or_( - r.row['starts_and_stops'].nth(-1)['stop'].default(r.maxval).lt(finished_before))) - logging.debug( - 'retrieving jobs older than %s: %s', finished_before, reql) + args.finished_before, "%Y-%m-%d" + ).replace(tzinfo=doublethink.UTC) + reql = rr.table("jobs").filter( + r.row["finished"] + .default(r.maxval) + .lt(finished_before) + .or_( + r.row["starts_and_stops"] + .nth(-1)["stop"] + .default(r.maxval) + .lt(finished_before) + ) + ) + logging.debug("retrieving jobs older than %s: %s", finished_before, reql) for job in reql.run(): # logging.info('job %s finished=%s starts_and_stops[-1]["stop"]=%s', # job['id'], job.get('finished'), # job.get('starts_and_stops', [{'stop':None}])[-1]['stop']) - _purge_job(rr, job['id']) + _purge_job(rr, job["id"]) + def _purge_site(rr, site_id): - reql = rr.table('pages').between( - [site_id, r.minval, r.minval], - [site_id, r.maxval, r.maxval], - index='priority_by_site').delete() - logging.debug('purging pages for site %s: %s', site_id, reql) + reql = ( + rr.table("pages") + .between( + [site_id, r.minval, r.minval], + [site_id, r.maxval, r.maxval], + index="priority_by_site", + ) + .delete() + ) + logging.debug("purging pages for site %s: %s", site_id, reql) result = reql.run() - logging.info('purged pages for site %s: %s', site_id, result) + logging.info("purged pages for site %s: %s", site_id, result) - reql = rr.table('sites').get(site_id).delete() - logging.debug('purging site %s: %s', site_id, reql) + reql = rr.table("sites").get(site_id).delete() + logging.debug("purging site %s: %s", site_id, reql) result = reql.run() - logging.info('purged site 
%s: %s', site_id, result) + logging.info("purged site %s: %s", site_id, result) + def _purge_job(rr, job_id): - reql = rr.table('sites').get_all(job_id, index='job_id').get_field('id') - logging.debug('querying rethinkdb: %s', reql) + reql = rr.table("sites").get_all(job_id, index="job_id").get_field("id") + logging.debug("querying rethinkdb: %s", reql) site_ids = list(reql.run()) for site_id in site_ids: _purge_site(rr, site_id) - reql = rr.table('jobs').get(job_id).delete() - logging.debug('purging job %s: %s', job_id, reql) + reql = rr.table("jobs").get(job_id).delete() + logging.debug("purging job %s: %s", job_id, reql) result = reql.run() - logging.info('purged job %s: %s', job_id, result) + logging.info("purged job %s: %s", job_id, result) + def brozzler_list_captures(argv=None): - ''' + """ Handy utility for looking up entries in the rethinkdb "captures" table by url or sha1. - ''' + """ import urlcanon argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - formatter_class=BetterArgumentDefaultsHelpFormatter) + prog=os.path.basename(argv[0]), + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) arg_parser.add_argument( - '-p', '--prefix', dest='prefix', action='store_true', help=( - 'use prefix match for url (n.b. may not work as expected if ' - 'searching key has query string because canonicalization can ' - 'reorder query parameters)')) + "-p", + "--prefix", + dest="prefix", + action="store_true", + help=( + "use prefix match for url (n.b. may not work as expected if " + "searching key has query string because canonicalization can " + "reorder query parameters)" + ), + ) arg_parser.add_argument( - '--yaml', dest='yaml', action='store_true', help=( - 'yaml output (default is json)')) + "--yaml", + dest="yaml", + action="store_true", + help=("yaml output (default is json)"), + ) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) arg_parser.add_argument( - 'url_or_sha1', metavar='URL_or_SHA1', - help='url or sha1 to look up in captures table') + "url_or_sha1", + metavar="URL_or_SHA1", + help="url or sha1 to look up in captures table", + ) args = arg_parser.parse_args(args=argv[1:]) configure_logging(args) rr = rethinker(args) - if args.url_or_sha1[:5] == 'sha1:': + if args.url_or_sha1[:5] == "sha1:": if args.prefix: logging.warning( - 'ignoring supplied --prefix option which does not apply ' - 'to lookup by sha1') + "ignoring supplied --prefix option which does not apply " + "to lookup by sha1" + ) # assumes it's already base32 (XXX could detect if hex and convert) sha1base32 = args.url_or_sha1[5:].upper() - reql = rr.table('captures').between( - [sha1base32, r.minval, r.minval], - [sha1base32, r.maxval, r.maxval], - index='sha1_warc_type') - logging.debug('querying rethinkdb: %s', reql) + reql = rr.table("captures").between( + [sha1base32, r.minval, r.minval], + [sha1base32, r.maxval, r.maxval], + index="sha1_warc_type", + ) + logging.debug("querying rethinkdb: %s", reql) results = reql.run() else: - key = urlcanon.semantic(args.url_or_sha1).surt().decode('ascii') + key = urlcanon.semantic(args.url_or_sha1).surt().decode("ascii") abbr_start_key = key[:150] if args.prefix: # surt is necessarily ascii and \x7f is the last ascii character - abbr_end_key = key[:150] + '\x7f' - end_key = key + '\x7f' + abbr_end_key = key[:150] + "\x7f" + end_key = key + "\x7f" else: abbr_end_key = key[:150] end_key = key - reql = rr.table('captures').between( - [abbr_start_key, r.minval], - [abbr_end_key, r.maxval], - 
index='abbr_canon_surt_timestamp', right_bound='closed') - reql = reql.order_by(index='abbr_canon_surt_timestamp') + reql = rr.table("captures").between( + [abbr_start_key, r.minval], + [abbr_end_key, r.maxval], + index="abbr_canon_surt_timestamp", + right_bound="closed", + ) + reql = reql.order_by(index="abbr_canon_surt_timestamp") reql = reql.filter( - lambda capture: (capture['canon_surt'] >= key) - & (capture['canon_surt'] <= end_key)) - logging.debug('querying rethinkdb: %s', reql) + lambda capture: (capture["canon_surt"] >= key) + & (capture["canon_surt"] <= end_key) + ) + logging.debug("querying rethinkdb: %s", reql) results = reql.run() if args.yaml: yaml.dump_all( - results, stream=sys.stdout, explicit_start=True, - default_flow_style=False) + results, stream=sys.stdout, explicit_start=True, default_flow_style=False + ) else: for result in results: print(json.dumps(result, cls=Jsonner, indent=2)) + def brozzler_stop_crawl(argv=None): argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - formatter_class=BetterArgumentDefaultsHelpFormatter) + prog=os.path.basename(argv[0]), + formatter_class=BetterArgumentDefaultsHelpFormatter, + ) group = arg_parser.add_mutually_exclusive_group(required=True) add_rethinkdb_options(arg_parser) group.add_argument( - '--job', dest='job_id', metavar='JOB_ID', help=( - 'request crawl stop for the specified job')) + "--job", + dest="job_id", + metavar="JOB_ID", + help=("request crawl stop for the specified job"), + ) group.add_argument( - '--site', dest='site_id', metavar='SITE_ID', help=( - 'request crawl stop for the specified site')) + "--site", + dest="site_id", + metavar="SITE_ID", + help=("request crawl stop for the specified site"), + ) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -834,7 +1097,7 @@ def brozzler_stop_crawl(argv=None): job_id = args.job_id job = brozzler.Job.load(rr, job_id) if not job: - logging.fatal('job not found with id=%r', job_id) + logging.fatal("job not found with id=%r", job_id) sys.exit(1) job.stop_requested = doublethink.utcnow() job.save() @@ -845,8 +1108,7 @@ def brozzler_stop_crawl(argv=None): site_id = args.site_id site = brozzler.Site.load(rr, site_id) if not site: - logging.fatal('site not found with id=%r', site_id) + logging.fatal("site not found with id=%r", site_id) sys.exit(1) site.stop_requested = doublethink.utcnow() site.save() - diff --git a/brozzler/dashboard/__init__.py b/brozzler/dashboard/__init__.py index 6e85b3c0..16e34c16 100644 --- a/brozzler/dashboard/__init__.py +++ b/brozzler/dashboard/__init__.py @@ -1,4 +1,4 @@ -''' +""" brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api endspoints etc @@ -15,17 +15,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
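brozzler-stop-crawl amounts to stamping stop_requested on the job or site record, which the crawl machinery then acts on. A sketch of doing the same directly, assuming a local RethinkDB and a purely illustrative job id:

    import doublethink
    import brozzler

    rr = doublethink.Rethinker(["localhost"], "brozzler")
    job = brozzler.Job.load(rr, 1)  # hypothetical job id
    if job:
        job.stop_requested = doublethink.utcnow()
        job.save()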
-''' +""" import logging import sys + try: import flask except ImportError as e: logging.critical( - '%s: %s\n\nYou might need to run "pip install ' - 'brozzler[dashboard]".\nSee README.rst for more information.', - type(e).__name__, e) + '%s: %s\n\nYou might need to run "pip install ' + 'brozzler[dashboard]".\nSee README.rst for more information.', + type(e).__name__, + e, + ) sys.exit(1) import doublethink import json @@ -41,33 +44,44 @@ # configure with environment variables SETTINGS = { - 'RETHINKDB_SERVERS': os.environ.get( - 'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','), - 'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'), - 'WAYBACK_BASEURL': os.environ.get( - 'WAYBACK_BASEURL', 'http://localhost:8880/brozzler'), - 'DASHBOARD_PORT': os.environ.get('DASHBOARD_PORT', '8000'), - 'DASHBOARD_INTERFACE': os.environ.get('DASHBOARD_INTERFACE', 'localhost') + "RETHINKDB_SERVERS": os.environ.get( + "BROZZLER_RETHINKDB_SERVERS", "localhost" + ).split(","), + "RETHINKDB_DB": os.environ.get("BROZZLER_RETHINKDB_DB", "brozzler"), + "WAYBACK_BASEURL": os.environ.get( + "WAYBACK_BASEURL", "http://localhost:8880/brozzler" + ), + "DASHBOARD_PORT": os.environ.get("DASHBOARD_PORT", "8000"), + "DASHBOARD_INTERFACE": os.environ.get("DASHBOARD_INTERFACE", "localhost"), } -rr = doublethink.Rethinker( - SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB']) +rr = doublethink.Rethinker(SETTINGS["RETHINKDB_SERVERS"], db=SETTINGS["RETHINKDB_DB"]) _svc_reg = None + + def service_registry(): global _svc_reg if not _svc_reg: _svc_reg = doublethink.ServiceRegistry(rr) return _svc_reg + @app.route("/api/sites//queued_count") @app.route("/api/site//queued_count") def queued_count(site_id): - reql = rr.table("pages").between( - [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval], - index="priority_by_site").count() + reql = ( + rr.table("pages") + .between( + [site_id, 0, False, r.minval], + [site_id, 0, False, r.maxval], + index="priority_by_site", + ) + .count() + ) logging.debug("querying rethinkdb: %s", reql) count = reql.run() return flask.jsonify(count=count) + @app.route("/api/sites//queue") @app.route("/api/site//queue") def queue(site_id): @@ -75,38 +89,52 @@ def queue(site_id): start = flask.request.args.get("start", 0) end = flask.request.args.get("end", start + 90) reql = rr.table("pages").between( - [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval], - index="priority_by_site")[start:end] + [site_id, 0, False, r.minval], + [site_id, 0, False, r.maxval], + index="priority_by_site", + )[start:end] logging.debug("querying rethinkdb: %s", reql) queue_ = reql.run() return flask.jsonify(queue_=list(queue_)) + @app.route("/api/sites//pages_count") @app.route("/api/site//pages_count") @app.route("/api/sites//page_count") @app.route("/api/site//page_count") def page_count(site_id): - reql = rr.table("pages").between( + reql = ( + rr.table("pages") + .between( [site_id, 1, False, r.minval], [site_id, r.maxval, False, r.maxval], - index="priority_by_site").count() + index="priority_by_site", + ) + .count() + ) logging.debug("querying rethinkdb: %s", reql) count = reql.run() return flask.jsonify(count=count) + @app.route("/api/sites//pages") @app.route("/api/site//pages") def pages(site_id): """Pages already crawled.""" start = int(flask.request.args.get("start", 0)) end = int(flask.request.args.get("end", start + 90)) - reql = rr.table("pages").between( - [site_id, 1, r.minval], [site_id, r.maxval, r.maxval], - 
index="least_hops").order_by(index="least_hops")[start:end] + reql = ( + rr.table("pages") + .between( + [site_id, 1, r.minval], [site_id, r.maxval, r.maxval], index="least_hops" + ) + .order_by(index="least_hops")[start:end] + ) logging.debug("querying rethinkdb: %s", reql) pages_ = reql.run() return flask.jsonify(pages=list(pages_)) + @app.route("/api/pages/") @app.route("/api/page/") def page(page_id): @@ -115,6 +143,7 @@ def page(page_id): page_ = reql.run() return flask.jsonify(page_) + @app.route("/api/pages//yaml") @app.route("/api/page//yaml") def page_yaml(page_id): @@ -122,8 +151,9 @@ def page_yaml(page_id): logging.debug("querying rethinkdb: %s", reql) page_ = reql.run() return app.response_class( - yaml.dump(page_, default_flow_style=False), - mimetype="application/yaml") + yaml.dump(page_, default_flow_style=False), mimetype="application/yaml" + ) + @app.route("/api/sites/") @app.route("/api/site/") @@ -135,6 +165,7 @@ def site(site_id): s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii") return flask.jsonify(s) + @app.route("/api/sites//yaml") @app.route("/api/site//yaml") def site_yaml(site_id): @@ -142,8 +173,9 @@ def site_yaml(site_id): logging.debug("querying rethinkdb: %s", reql) site_ = reql.run() return app.response_class( - yaml.dump(site_, default_flow_style=False), - mimetype="application/yaml") + yaml.dump(site_, default_flow_style=False), mimetype="application/yaml" + ) + @app.route("/api/stats/") def stats(bucket): @@ -152,6 +184,7 @@ def stats(bucket): stats_ = reql.run() return flask.jsonify(stats_) + @app.route("/api/jobs//sites") @app.route("/api/job//sites") def sites(job_id): @@ -168,6 +201,7 @@ def sites(job_id): s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii") return flask.jsonify(sites=sites_) + @app.route("/api/jobless-sites") def jobless_sites(): # XXX inefficient (unindexed) query @@ -180,6 +214,7 @@ def jobless_sites(): s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii") return flask.jsonify(sites=sites_) + @app.route("/api/jobs/") @app.route("/api/job/") def job(job_id): @@ -192,6 +227,7 @@ def job(job_id): job_ = reql.run() return flask.jsonify(job_) + @app.route("/api/jobs//yaml") @app.route("/api/job//yaml") def job_yaml(job_id): @@ -203,19 +239,22 @@ def job_yaml(job_id): logging.debug("querying rethinkdb: %s", reql) job_ = reql.run() return app.response_class( - yaml.dump(job_, default_flow_style=False), - mimetype="application/yaml") + yaml.dump(job_, default_flow_style=False), mimetype="application/yaml" + ) + @app.route("/api/workers") def workers(): workers_ = service_registry().available_services("brozzler-worker") return flask.jsonify(workers=list(workers_)) + @app.route("/api/services") def services(): services_ = service_registry().available_services() return flask.jsonify(services=list(services_)) + @app.route("/api/jobs") def jobs(): reql = rr.table("jobs").order_by(r.desc("id")) @@ -223,20 +262,24 @@ def jobs(): jobs_ = list(reql.run()) return flask.jsonify(jobs=jobs_) + @app.route("/api/config") def config(): return flask.jsonify(config=SETTINGS) + @app.route("/api/") -@app.route("/api", defaults={"path":""}) +@app.route("/api", defaults={"path": ""}) def api404(path): flask.abort(404) + @app.route("/", defaults={"path": ""}) @app.route("/") def root(path): return flask.render_template("index.html") + try: import gunicorn.app.base from gunicorn.six import iteritems @@ -255,8 +298,12 @@ def __init__(self, app, options=None): def load_config(self): config = dict( - [(key, value) for key, 
value in iteritems(self.options) - if key in self.cfg.settings and value is not None]) + [ + (key, value) + for key, value in iteritems(self.options) + if key in self.cfg.settings and value is not None + ] + ) for key, value in iteritems(config): self.cfg.set(key.lower(), value) self.cfg.set("logger_class", BypassGunicornLogging) @@ -270,37 +317,42 @@ def run(**options): GunicornBrozzlerDashboard(app, options).run() except ImportError: + def run(): logging.info("running brozzler-dashboard using simple flask app.run") - app.run(host=SETTINGS['DASHBOARD_INTERFACE'], port=SETTINGS['DASHBOARD_PORT']) + app.run(host=SETTINGS["DASHBOARD_INTERFACE"], port=SETTINGS["DASHBOARD_PORT"]) + def main(argv=None): import argparse import brozzler.cli + argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - prog=os.path.basename(argv[0]), - formatter_class=argparse.RawDescriptionHelpFormatter, - description=( - 'brozzler-dashboard - web application for viewing brozzler ' - 'crawl status'), - epilog=( - 'brozzler-dashboard has no command line options, but can be ' - 'configured using the following environment variables:\n\n' - ' BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. ' - 'db0.foo.org,db0.foo.org:38015,db1.foo.org (default: ' - 'localhost)\n' - ' BROZZLER_RETHINKDB_DB rethinkdb database name ' - '(default: brozzler)\n' - ' WAYBACK_BASEURL base url for constructing wayback ' - 'links (default http://localhost:8880/brozzler)' - ' DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n' - ' DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)')) + prog=os.path.basename(argv[0]), + formatter_class=argparse.RawDescriptionHelpFormatter, + description=( + "brozzler-dashboard - web application for viewing brozzler " "crawl status" + ), + epilog=( + "brozzler-dashboard has no command line options, but can be " + "configured using the following environment variables:\n\n" + " BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. " + "db0.foo.org,db0.foo.org:38015,db1.foo.org (default: " + "localhost)\n" + " BROZZLER_RETHINKDB_DB rethinkdb database name " + "(default: brozzler)\n" + " WAYBACK_BASEURL base url for constructing wayback " + "links (default http://localhost:8880/brozzler)" + " DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n" + " DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)" + ), + ) brozzler.cli.add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) brozzler.cli.configure_logging(args) run() + if __name__ == "__main__": main() - diff --git a/brozzler/easy.py b/brozzler/easy.py index dd988844..f8392f65 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -''' +""" brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all working together in a single process @@ -16,10 +16,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
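Because brozzler-dashboard reads its SETTINGS from the environment at import time, overrides have to be in place before brozzler.dashboard is imported. A sketch with placeholder values:

    import os

    # set before importing brozzler.dashboard, which reads these at import time
    os.environ["BROZZLER_RETHINKDB_SERVERS"] = "db0.example.org,db1.example.org"
    os.environ["BROZZLER_RETHINKDB_DB"] = "brozzler"
    os.environ["DASHBOARD_INTERFACE"] = "0.0.0.0"
    os.environ["DASHBOARD_PORT"] = "8000"

    import brozzler.dashboard
    brozzler.dashboard.main(["brozzler-dashboard"])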
-''' +""" import sys import logging + try: import warcprox import warcprox.main @@ -30,9 +31,11 @@ import brozzler.dashboard except ImportError as e: logging.critical( - '%s: %s\n\nYou might need to run "pip install ' - 'brozzler[easy]".\nSee README.rst for more information.', - type(e).__name__, e) + '%s: %s\n\nYou might need to run "pip install ' + 'brozzler[easy]".\nSee README.rst for more information.', + type(e).__name__, + e, + ) sys.exit(1) import argparse import brozzler @@ -46,76 +49,112 @@ import traceback import socketserver + def _build_arg_parser(argv=None): argv = argv or sys.argv arg_parser = argparse.ArgumentParser( - formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter, - prog=os.path.basename(argv[0]), description=( - 'brozzler-easy - easy deployment of brozzler, with ' - 'brozzler-worker, warcprox, pywb, and brozzler-dashboard all ' - 'running in a single process')) + formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter, + prog=os.path.basename(argv[0]), + description=( + "brozzler-easy - easy deployment of brozzler, with " + "brozzler-worker, warcprox, pywb, and brozzler-dashboard all " + "running in a single process" + ), + ) # common args brozzler.cli.add_rethinkdb_options(arg_parser) arg_parser.add_argument( - '-d', '--warcs-dir', dest='warcs_dir', default='./warcs', - help='where to write warcs') + "-d", + "--warcs-dir", + dest="warcs_dir", + default="./warcs", + help="where to write warcs", + ) # warcprox args arg_parser.add_argument( - '-c', '--cacert', dest='cacert', - default='./%s-warcprox-ca.pem' % socket.gethostname(), - help=( - 'warcprox CA certificate file; if file does not exist, it ' - 'will be created')) + "-c", + "--cacert", + dest="cacert", + default="./%s-warcprox-ca.pem" % socket.gethostname(), + help=( + "warcprox CA certificate file; if file does not exist, it " + "will be created" + ), + ) arg_parser.add_argument( - '--certs-dir', dest='certs_dir', - default='./%s-warcprox-ca' % socket.gethostname(), - help='where warcprox will store and load generated certificates') + "--certs-dir", + dest="certs_dir", + default="./%s-warcprox-ca" % socket.gethostname(), + help="where warcprox will store and load generated certificates", + ) arg_parser.add_argument( - '--onion-tor-socks-proxy', dest='onion_tor_socks_proxy', - default=None, help=( - 'host:port of tor socks proxy, used only to connect to ' - '.onion sites')) + "--onion-tor-socks-proxy", + dest="onion_tor_socks_proxy", + default=None, + help=("host:port of tor socks proxy, used only to connect to " ".onion sites"), + ) # brozzler-worker args arg_parser.add_argument( - '-e', '--chrome-exe', dest='chrome_exe', - default=brozzler.cli.suggest_default_chrome_exe(), - help='executable to use to invoke chrome') + "-e", + "--chrome-exe", + dest="chrome_exe", + default=brozzler.cli.suggest_default_chrome_exe(), + help="executable to use to invoke chrome", + ) arg_parser.add_argument( - '-n', '--max-browsers', dest='max_browsers', - type=int, default=1, help=( - 'max number of chrome instances simultaneously ' - 'browsing pages')) + "-n", + "--max-browsers", + dest="max_browsers", + type=int, + default=1, + help=("max number of chrome instances simultaneously " "browsing pages"), + ) # pywb args arg_parser.add_argument( - '--pywb-address', dest='pywb_address', - default='0.0.0.0', - help='pywb wayback address to listen on') + "--pywb-address", + dest="pywb_address", + default="0.0.0.0", + help="pywb wayback address to listen on", + ) arg_parser.add_argument( - '--pywb-port', 
dest='pywb_port', type=int, - default=8880, help='pywb wayback port') + "--pywb-port", + dest="pywb_port", + type=int, + default=8880, + help="pywb wayback port", + ) # dashboard args arg_parser.add_argument( - '--dashboard-address', dest='dashboard_address', - default='localhost', - help='brozzler dashboard address to listen on') + "--dashboard-address", + dest="dashboard_address", + default="localhost", + help="brozzler dashboard address to listen on", + ) arg_parser.add_argument( - '--dashboard-port', dest='dashboard_port', - type=int, default=8881, help='brozzler dashboard port') + "--dashboard-port", + dest="dashboard_port", + type=int, + default=8881, + help="brozzler dashboard port", + ) # common at the bottom args brozzler.cli.add_common_options(arg_parser, argv) return arg_parser + class ThreadingWSGIServer( - socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer): + socketserver.ThreadingMixIn, wsgiref.simple_server.WSGIServer +): pass + class BrozzlerEasyController: logger = logging.getLogger(__module__ + "." + __qualname__) @@ -123,25 +162,31 @@ def __init__(self, args): self.stop = threading.Event() self.args = args self.warcprox_controller = warcprox.controller.WarcproxController( - self._warcprox_opts(args)) + self._warcprox_opts(args) + ) self.brozzler_worker = self._init_brozzler_worker(args) self.pywb_httpd = self._init_pywb(args) self.dashboard_httpd = self._init_brozzler_dashboard(args) def _init_brozzler_dashboard(self, args): return wsgiref.simple_server.make_server( - args.dashboard_address, args.dashboard_port, - brozzler.dashboard.app, ThreadingWSGIServer) + args.dashboard_address, + args.dashboard_port, + brozzler.dashboard.app, + ThreadingWSGIServer, + ) def _init_brozzler_worker(self, args): - rr = doublethink.Rethinker( - args.rethinkdb_servers.split(","), args.rethinkdb_db) + rr = doublethink.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db) frontier = brozzler.RethinkDbFrontier(rr) service_registry = doublethink.ServiceRegistry(rr) worker = brozzler.worker.BrozzlerWorker( - frontier, service_registry, chrome_exe=args.chrome_exe, - proxy='%s:%s' % self.warcprox_controller.proxy.server_address, - max_browsers=args.max_browsers) + frontier, + service_registry, + chrome_exe=args.chrome_exe, + proxy="%s:%s" % self.warcprox_controller.proxy.server_address, + max_browsers=args.max_browsers, + ) return worker def _init_pywb(self, args): @@ -152,66 +197,67 @@ def _init_pywb(self, args): brozzler.pywb.monkey_patch_fuzzy_query() brozzler.pywb.monkey_patch_calc_search_range() - if args.warcs_dir.endswith('/'): + if args.warcs_dir.endswith("/"): warcs_dir = args.warcs_dir else: - warcs_dir = args.warcs_dir + '/' + warcs_dir = args.warcs_dir + "/" conf = { - 'collections': { - 'brozzler': { - 'index_paths': brozzler.pywb.RethinkCDXSource( + "collections": { + "brozzler": { + "index_paths": brozzler.pywb.RethinkCDXSource( servers=args.rethinkdb_servers.split(","), - db=args.rethinkdb_db, table='captures') + db=args.rethinkdb_db, + table="captures", + ) }, }, # 'enable_http_proxy': True, # 'enable_memento': True, - 'archive_paths': warcs_dir, - 'enable_cdx_api': True, - 'framed_replay': True, - 'port': args.pywb_port, - 'enable_auto_colls': False, + "archive_paths": warcs_dir, + "enable_cdx_api": True, + "framed_replay": True, + "port": args.pywb_port, + "enable_auto_colls": False, } wsgi_app = pywb.framework.wsgi_wrappers.init_app( - pywb.webapp.pywb_init.create_wb_router, config=conf, - load_yaml=False) + pywb.webapp.pywb_init.create_wb_router, 
config=conf, load_yaml=False + ) # disable is_hop_by_hop restrictions wsgiref.handlers.is_hop_by_hop = lambda x: False return wsgiref.simple_server.make_server( - args.pywb_address, args.pywb_port, wsgi_app, - ThreadingWSGIServer) + args.pywb_address, args.pywb_port, wsgi_app, ThreadingWSGIServer + ) def start(self): - self.logger.info('starting warcprox') + self.logger.info("starting warcprox") self.warcprox_controller.start() # XXX wait til fully started? - self.logger.info('starting brozzler-worker') + self.logger.info("starting brozzler-worker") self.brozzler_worker.start() - self.logger.info( - 'starting pywb at %s:%s', *self.pywb_httpd.server_address) + self.logger.info("starting pywb at %s:%s", *self.pywb_httpd.server_address) threading.Thread(target=self.pywb_httpd.serve_forever).start() self.logger.info( - 'starting brozzler-dashboard at %s:%s', - *self.dashboard_httpd.server_address) + "starting brozzler-dashboard at %s:%s", *self.dashboard_httpd.server_address + ) threading.Thread(target=self.dashboard_httpd.serve_forever).start() def shutdown(self): - self.logger.info('shutting down brozzler-dashboard') + self.logger.info("shutting down brozzler-dashboard") self.dashboard_httpd.shutdown() - self.logger.info('shutting down brozzler-worker') + self.logger.info("shutting down brozzler-worker") self.brozzler_worker.shutdown_now() # brozzler-worker is fully shut down at this point - self.logger.info('shutting down pywb') + self.logger.info("shutting down pywb") self.pywb_httpd.shutdown() - self.logger.info('shutting down warcprox') + self.logger.info("shutting down warcprox") self.warcprox_controller.shutdown() def wait_for_shutdown_request(self): @@ -222,14 +268,14 @@ def wait_for_shutdown_request(self): self.shutdown() def _warcprox_opts(self, args): - ''' + """ Takes args as produced by the argument parser built by _build_arg_parser and builds warcprox arguments object suitable to pass to warcprox.main.init_controller. Copies some arguments, renames some, populates some with defaults appropriate for brozzler-easy, etc. 
- ''' + """ warcprox_opts = warcprox.Options() - warcprox_opts.address = 'localhost' + warcprox_opts.address = "localhost" # let the OS choose an available port; discover it later using # sock.getsockname()[1] warcprox_opts.port = 0 @@ -237,17 +283,18 @@ def _warcprox_opts(self, args): warcprox_opts.certs_dir = args.certs_dir warcprox_opts.directory = args.warcs_dir warcprox_opts.gzip = True - warcprox_opts.prefix = 'brozzler' - warcprox_opts.size = 1000 * 1000* 1000 + warcprox_opts.prefix = "brozzler" + warcprox_opts.size = 1000 * 1000 * 1000 warcprox_opts.rollover_idle_time = 3 * 60 - warcprox_opts.digest_algorithm = 'sha1' + warcprox_opts.digest_algorithm = "sha1" warcprox_opts.base32 = True warcprox_opts.stats_db_file = None warcprox_opts.playback_port = None warcprox_opts.playback_index_db_file = None - warcprox_opts.rethinkdb_big_table_url = ( - 'rethinkdb://%s/%s/captures' % ( - args.rethinkdb_servers, args.rethinkdb_db)) + warcprox_opts.rethinkdb_big_table_url = "rethinkdb://%s/%s/captures" % ( + args.rethinkdb_servers, + args.rethinkdb_db, + ) warcprox_opts.queue_size = 500 warcprox_opts.max_threads = None warcprox_opts.profile = False @@ -259,9 +306,11 @@ def dump_state(self, signum=None, frame=None): for th in threading.enumerate(): state_strs.append(str(th)) stack = traceback.format_stack(sys._current_frames()[th.ident]) - state_strs.append(''.join(stack)) - logging.warning('dumping state (caught signal {})\n{}'.format( - signum, '\n'.join(state_strs))) + state_strs.append("".join(stack)) + logging.warning( + "dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs)) + ) + def main(argv=None): argv = argv or sys.argv @@ -271,8 +320,8 @@ def main(argv=None): brozzler.chrome.check_version(args.chrome_exe) controller = BrozzlerEasyController(args) - signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) - signal.signal(signal.SIGINT, lambda a,b: controller.stop.set()) + signal.signal(signal.SIGTERM, lambda a, b: controller.stop.set()) + signal.signal(signal.SIGINT, lambda a, b: controller.stop.set()) signal.signal(signal.SIGQUIT, controller.dump_state) controller.start() controller.wait_for_shutdown_request() diff --git a/brozzler/frontier.py b/brozzler/frontier.py index ac270471..afb2a574 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -1,4 +1,4 @@ -''' +""" brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages Copyright (C) 2014-2018 Internet Archive @@ -14,7 +14,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -''' +""" import logging import brozzler @@ -27,9 +27,11 @@ r = rdb.RethinkDB() + class UnexpectedDbResult(Exception): pass + class RethinkDbFrontier: logger = logging.getLogger(__module__ + "." 
+ __qualname__) @@ -47,40 +49,49 @@ def _ensure_db(self): tables = self.rr.table_list().run() if not "sites" in tables: self.logger.info( - "creating rethinkdb table 'sites' in database %r", - self.rr.dbname) + "creating rethinkdb table 'sites' in database %r", self.rr.dbname + ) self.rr.table_create( - "sites", shards=self.shards, replicas=self.replicas).run() - self.rr.table("sites").index_create("sites_last_disclaimed", [ - r.row["status"], r.row["last_disclaimed"]]).run() + "sites", shards=self.shards, replicas=self.replicas + ).run() + self.rr.table("sites").index_create( + "sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]] + ).run() self.rr.table("sites").index_create("job_id").run() if not "pages" in tables: self.logger.info( - "creating rethinkdb table 'pages' in database %r", - self.rr.dbname) + "creating rethinkdb table 'pages' in database %r", self.rr.dbname + ) self.rr.table_create( - "pages", shards=self.shards, replicas=self.replicas).run() - self.rr.table("pages").index_create("priority_by_site", [ - r.row["site_id"], r.row["brozzle_count"], - r.row["claimed"], r.row["priority"]]).run() + "pages", shards=self.shards, replicas=self.replicas + ).run() + self.rr.table("pages").index_create( + "priority_by_site", + [ + r.row["site_id"], + r.row["brozzle_count"], + r.row["claimed"], + r.row["priority"], + ], + ).run() # this index is for displaying pages in a sensible order in the web # console - self.rr.table("pages").index_create("least_hops", [ - r.row["site_id"], r.row["brozzle_count"], - r.row["hops_from_seed"]]).run() + self.rr.table("pages").index_create( + "least_hops", + [r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]], + ).run() if not "jobs" in tables: self.logger.info( - "creating rethinkdb table 'jobs' in database %r", - self.rr.dbname) + "creating rethinkdb table 'jobs' in database %r", self.rr.dbname + ) self.rr.table_create( - "jobs", shards=self.shards, replicas=self.replicas).run() + "jobs", shards=self.shards, replicas=self.replicas + ).run() def _vet_result(self, result, **kwargs): # self.logger.debug("vetting expected=%s result=%s", kwargs, result) # {'replaced': 0, 'errors': 0, 'skipped': 0, 'inserted': 1, 'deleted': 0, 'generated_keys': ['292859c1-4926-4b27-9d87-b2c367667058'], 'unchanged': 0} - for k in [ - "replaced", "errors", "skipped", "inserted", "deleted", - "unchanged"]: + for k in ["replaced", "errors", "skipped", "inserted", "deleted", "unchanged"]: if k in kwargs: expected = kwargs[k] else: @@ -88,81 +99,110 @@ def _vet_result(self, result, **kwargs): if isinstance(expected, list): if result.get(k) not in kwargs[k]: raise UnexpectedDbResult( - "expected %r to be one of %r in %r" % ( - k, expected, result)) + "expected %r to be one of %r in %r" % (k, expected, result) + ) else: if result.get(k) != expected: - raise UnexpectedDbResult("expected %r to be %r in %r" % ( - k, expected, result)) + raise UnexpectedDbResult( + "expected %r to be %r in %r" % (k, expected, result) + ) def claim_sites(self, n=1): - self.logger.trace('claiming up to %s sites to brozzle', n) + self.logger.trace("claiming up to %s sites to brozzle", n) result = ( - self.rr.table('sites').get_all(r.args( - r.db(self.rr.dbname).table('sites', read_mode='majority') - .between( - ['ACTIVE', r.minval], ['ACTIVE', r.maxval], - index='sites_last_disclaimed') - .order_by(r.desc('claimed'), 'last_disclaimed') - .fold( - {}, lambda acc, site: acc.merge( - r.branch( - site.has_fields('job_id'), - r.object( - site['job_id'].coerce_to('string'), - 
acc[site['job_id'].coerce_to('string')].default(0).add(1)), - {})), - emit=lambda acc, site, new_acc: r.branch( - r.and_( - r.or_( - site['claimed'].not_(), - site['last_claimed'].lt(r.now().sub(60*60))), - r.or_( - site.has_fields('max_claimed_sites').not_(), - new_acc[site['job_id'].coerce_to('string')].le(site['max_claimed_sites']))), - [site['id']], [])) - .limit(n))) + self.rr.table("sites") + .get_all( + r.args( + r.db(self.rr.dbname) + .table("sites", read_mode="majority") + .between( + ["ACTIVE", r.minval], + ["ACTIVE", r.maxval], + index="sites_last_disclaimed", + ) + .order_by(r.desc("claimed"), "last_disclaimed") + .fold( + {}, + lambda acc, site: acc.merge( + r.branch( + site.has_fields("job_id"), + r.object( + site["job_id"].coerce_to("string"), + acc[site["job_id"].coerce_to("string")] + .default(0) + .add(1), + ), + {}, + ) + ), + emit=lambda acc, site, new_acc: r.branch( + r.and_( + r.or_( + site["claimed"].not_(), + site["last_claimed"].lt(r.now().sub(60 * 60)), + ), + r.or_( + site.has_fields("max_claimed_sites").not_(), + new_acc[site["job_id"].coerce_to("string")].le( + site["max_claimed_sites"] + ), + ), + ), + [site["id"]], + [], + ), + ) + .limit(n) + ) + ) .update( # try to avoid a race condition resulting in multiple # brozzler-workers claiming the same site # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038 r.branch( r.or_( - r.row['claimed'].not_(), - r.row['last_claimed'].lt(r.now().sub(60*60))), - {'claimed': True, 'last_claimed': r.now()}, - {}), - return_changes=True)).run() + r.row["claimed"].not_(), + r.row["last_claimed"].lt(r.now().sub(60 * 60)), + ), + {"claimed": True, "last_claimed": r.now()}, + {}, + ), + return_changes=True, + ) + ).run() self._vet_result( - result, replaced=list(range(n+1)), - unchanged=list(range(n+1))) + result, replaced=list(range(n + 1)), unchanged=list(range(n + 1)) + ) sites = [] for i in range(result["replaced"]): if result["changes"][i]["old_val"]["claimed"]: self.logger.warning( - "re-claimed site that was still marked 'claimed' " - "because it was last claimed a long time ago " - "at %s, and presumably some error stopped it from " - "being disclaimed", - result["changes"][i]["old_val"]["last_claimed"]) + "re-claimed site that was still marked 'claimed' " + "because it was last claimed a long time ago " + "at %s, and presumably some error stopped it from " + "being disclaimed", + result["changes"][i]["old_val"]["last_claimed"], + ) site = brozzler.Site(self.rr, result["changes"][i]["new_val"]) sites.append(site) - self.logger.debug('claimed %s sites', len(sites)) + self.logger.debug("claimed %s sites", len(sites)) if sites: return sites else: raise brozzler.NothingToClaim def enforce_time_limit(self, site): - ''' + """ Raises `brozzler.ReachedTimeLimit` if appropriate. - ''' - if (site.time_limit and site.time_limit > 0 - and site.elapsed() > site.time_limit): + """ + if site.time_limit and site.time_limit > 0 and site.elapsed() > site.time_limit: self.logger.debug( - "site FINISHED_TIME_LIMIT! time_limit=%s " - "elapsed=%s %s", site.time_limit, site.elapsed(), site) + "site FINISHED_TIME_LIMIT! 
time_limit=%s " "elapsed=%s %s", + site.time_limit, + site.elapsed(), + site, + ) raise brozzler.ReachedTimeLimit def claim_page(self, site, worker_id): @@ -170,26 +210,37 @@ def claim_page(self, site, worker_id): # brozzler-worker can be working on a site at a time, and that would # have to be the worker calling this method, so if something is claimed # already, it must have been left that way because of some error - result = self.rr.table("pages").between( + result = ( + self.rr.table("pages") + .between( [site.id, 0, r.minval, r.minval], [site.id, 0, r.maxval, r.maxval], - index="priority_by_site").order_by( - index=r.desc("priority_by_site")).limit( - 1).update({ - "claimed":True, - "last_claimed_by":worker_id}, - return_changes="always").run() - self._vet_result(result, unchanged=[0,1], replaced=[0,1]) + index="priority_by_site", + ) + .order_by(index=r.desc("priority_by_site")) + .limit(1) + .update( + {"claimed": True, "last_claimed_by": worker_id}, return_changes="always" + ) + .run() + ) + self._vet_result(result, unchanged=[0, 1], replaced=[0, 1]) if result["unchanged"] == 0 and result["replaced"] == 0: raise brozzler.NothingToClaim else: return brozzler.Page(self.rr, result["changes"][0]["new_val"]) def has_outstanding_pages(self, site): - results_iter = self.rr.table("pages").between( + results_iter = ( + self.rr.table("pages") + .between( [site.id, 0, r.minval, r.minval], [site.id, 0, r.maxval, r.maxval], - index="priority_by_site").limit(1).run() + index="priority_by_site", + ) + .limit(1) + .run() + ) return len(list(results_iter)) > 0 def completed_page(self, site, page): @@ -202,22 +253,24 @@ def completed_page(self, site, page): site.save() def active_jobs(self): - results = self.rr.table("jobs").filter({"status":"ACTIVE"}).run() + results = self.rr.table("jobs").filter({"status": "ACTIVE"}).run() for result in results: yield brozzler.Job(self.rr, result) def honor_stop_request(self, site): """Raises brozzler.CrawlStopped if stop has been requested.""" site.refresh() - if (site.stop_requested - and site.stop_requested <= doublethink.utcnow()): + if site.stop_requested and site.stop_requested <= doublethink.utcnow(): self.logger.info("stop requested for site %s", site.id) raise brozzler.CrawlStopped if site.job_id: job = brozzler.Job.load(self.rr, site.job_id) - if (job and job.stop_requested - and job.stop_requested <= doublethink.utcnow()): + if ( + job + and job.stop_requested + and job.stop_requested <= doublethink.utcnow() + ): self.logger.info("stop requested for job %s", site.job_id) raise brozzler.CrawlStopped @@ -239,8 +292,7 @@ def _maybe_finish_job(self, job_id): return False n += 1 - self.logger.info( - "all %s sites finished, job %s is FINISHED!", n, job.id) + self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id) job.finish() job.save() return True @@ -270,13 +322,11 @@ def disclaim_site(self, site, page=None): def resume_job(self, job): job.status = "ACTIVE" job.stop_requested = None - job.starts_and_stops.append( - {"start":doublethink.utcnow(), "stop":None}) + job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None}) job.save() for site in self.job_sites(job.id): site.status = "ACTIVE" - site.starts_and_stops.append( - {"start":doublethink.utcnow(), "stop":None}) + site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None}) site.save() def resume_site(self, site): @@ -285,51 +335,55 @@ def resume_site(self, site): job = brozzler.Job.load(self.rr, site.job_id) job.status = "ACTIVE" site.stop_requested = 
None - job.starts_and_stops.append( - {"start":doublethink.utcnow(), "stop":None}) + job.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None}) job.save() site.status = "ACTIVE" - site.starts_and_stops.append( - {"start":doublethink.utcnow(), "stop":None}) + site.starts_and_stops.append({"start": doublethink.utcnow(), "stop": None}) site.save() def _build_fresh_page(self, site, parent_page, url, hops_off=0): url_for_scoping = urlcanon.semantic(url) url_for_crawling = urlcanon.whatwg(url) - hashtag = (url_for_crawling.hash_sign - + url_for_crawling.fragment).decode('utf-8') + hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode( + "utf-8" + ) urlcanon.canon.remove_fragment(url_for_crawling) - page = brozzler.Page(self.rr, { - 'url': str(url_for_crawling), - 'site_id': site.id, - 'job_id': site.job_id, - 'hops_from_seed': parent_page.hops_from_seed + 1, - 'hop_path': str(parent_page.hop_path if parent_page.hop_path else "") + "L", - 'via_page_id': parent_page.id, - 'via_page_url': parent_page.url, - 'hops_off_surt': hops_off, - 'hashtags': [hashtag] if hashtag else []}) + page = brozzler.Page( + self.rr, + { + "url": str(url_for_crawling), + "site_id": site.id, + "job_id": site.job_id, + "hops_from_seed": parent_page.hops_from_seed + 1, + "hop_path": str(parent_page.hop_path if parent_page.hop_path else "") + + "L", + "via_page_id": parent_page.id, + "via_page_url": parent_page.url, + "hops_off_surt": hops_off, + "hashtags": [hashtag] if hashtag else [], + }, + ) return page def _merge_page(self, existing_page, fresh_page): - ''' + """ Utility method for merging info from `brozzler.Page` instances representing the same url but with possibly different metadata. - ''' + """ existing_page.priority += fresh_page.priority - existing_page.hashtags = list(set( - (existing_page.hashtags or []) + (fresh_page.hashtags or []))) - existing_page.hops_off = min( - existing_page.hops_off, fresh_page.hops_off) + existing_page.hashtags = list( + set((existing_page.hashtags or []) + (fresh_page.hashtags or [])) + ) + existing_page.hops_off = min(existing_page.hops_off, fresh_page.hops_off) def _scope_and_enforce_robots(self, site, parent_page, outlinks): - ''' + """ Returns tuple ( dict of {page_id: Page} of fresh `brozzler.Page` representing in scope links accepted by robots policy, set of in scope urls (canonicalized) blocked by robots policy, set of out-of-scope urls (canonicalized)). 
- ''' + """ pages = {} # {page_id: Page, ...} blocked = set() out_of_scope = set() @@ -337,17 +391,18 @@ def _scope_and_enforce_robots(self, site, parent_page, outlinks): url_for_scoping = urlcanon.semantic(url) url_for_crawling = urlcanon.whatwg(url) decision = site.accept_reject_or_neither( - url_for_scoping, parent_page=parent_page) + url_for_scoping, parent_page=parent_page + ) if decision is True: hops_off = 0 elif decision is None: - decision = parent_page.hops_off < site.scope.get( - 'max_hops_off', 0) + decision = parent_page.hops_off < site.scope.get("max_hops_off", 0) hops_off = parent_page.hops_off + 1 if decision is True: if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): fresh_page = self._build_fresh_page( - site, parent_page, url, hops_off) + site, parent_page, url, hops_off + ) if fresh_page.id in pages: self._merge_page(pages[fresh_page.id], fresh_page) else: @@ -359,31 +414,32 @@ def _scope_and_enforce_robots(self, site, parent_page, outlinks): return pages, blocked, out_of_scope def scope_and_schedule_outlinks(self, site, parent_page, outlinks): - decisions = {'accepted':set(),'blocked':set(),'rejected':set()} - counts = {'added':0,'updated':0,'rejected':0,'blocked':0} + decisions = {"accepted": set(), "blocked": set(), "rejected": set()} + counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0} fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots( - site, parent_page, outlinks) - decisions['blocked'] = blocked - decisions['rejected'] = out_of_scope - counts['blocked'] += len(blocked) - counts['rejected'] += len(out_of_scope) + site, parent_page, outlinks + ) + decisions["blocked"] = blocked + decisions["rejected"] = out_of_scope + counts["blocked"] += len(blocked) + counts["rejected"] += len(out_of_scope) # get existing pages from rethinkdb - results = self.rr.table('pages').get_all(*fresh_pages.keys()).run() - pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results} + results = self.rr.table("pages").get_all(*fresh_pages.keys()).run() + pages = {doc["id"]: brozzler.Page(self.rr, doc) for doc in results} # build list of pages to save, consisting of new pages, and existing # pages updated with higher priority and new hashtags for fresh_page in fresh_pages.values(): - decisions['accepted'].add(fresh_page.url) + decisions["accepted"].add(fresh_page.url) if fresh_page.id in pages: page = pages[fresh_page.id] self._merge_page(page, fresh_page) - counts['updated'] += 1 + counts["updated"] += 1 else: pages[fresh_page.id] = fresh_page - counts['added'] += 1 + counts["added"] += 1 # make sure we're not stepping on our own toes in case we have a link # back to parent_page, which I think happens because of hashtags @@ -396,19 +452,22 @@ def scope_and_schedule_outlinks(self, site, parent_page, outlinks): # there can be many pages and each one can be very large (many videos, # in and out of scope links, etc) l = list(pages.values()) - for batch in (l[i:i+50] for i in range(0, len(l), 50)): + for batch in (l[i : i + 50] for i in range(0, len(l), 50)): try: - self.logger.debug( - 'inserting/replacing batch of %s pages', len(batch)) - reql = self.rr.table('pages').insert(batch, conflict='replace') + self.logger.debug("inserting/replacing batch of %s pages", len(batch)) + reql = self.rr.table("pages").insert(batch, conflict="replace") self.logger.trace( - 'running query self.rr.table("pages").insert(%r, ' - 'conflict="replace")', batch) + 'running query self.rr.table("pages").insert(%r, ' + 'conflict="replace")', + batch, + ) result = 
reql.run() except Exception as e: self.logger.error( - 'problem inserting/replacing batch of %s pages', - len(batch), exc_info=True) + "problem inserting/replacing batch of %s pages", + len(batch), + exc_info=True, + ) parent_page.outlinks = {} for k in decisions: @@ -416,43 +475,56 @@ def scope_and_schedule_outlinks(self, site, parent_page, outlinks): parent_page.save() self.logger.info( - '%s new links added, %s existing links updated, %s links ' - 'rejected, %s links blocked by robots from %s', - counts['added'], counts['updated'], counts['rejected'], - counts['blocked'], parent_page) + "%s new links added, %s existing links updated, %s links " + "rejected, %s links blocked by robots from %s", + counts["added"], + counts["updated"], + counts["rejected"], + counts["blocked"], + parent_page, + ) def reached_limit(self, site, e): self.logger.info("reached_limit site=%s e=%s", site, e) assert isinstance(e, brozzler.ReachedLimit) - if (site.reached_limit - and site.reached_limit != e.warcprox_meta["reached-limit"]): + if ( + site.reached_limit + and site.reached_limit != e.warcprox_meta["reached-limit"] + ): self.logger.warning( - "reached limit %s but site had already reached limit %s", - e.warcprox_meta["reached-limit"], self.reached_limit) + "reached limit %s but site had already reached limit %s", + e.warcprox_meta["reached-limit"], + self.reached_limit, + ) else: site.reached_limit = e.warcprox_meta["reached-limit"] self.finished(site, "FINISHED_REACHED_LIMIT") def job_sites(self, job_id): - results = self.rr.table('sites').get_all(job_id, index="job_id").run() + results = self.rr.table("sites").get_all(job_id, index="job_id").run() for result in results: yield brozzler.Site(self.rr, result) def seed_page(self, site_id): - results = self.rr.table("pages").between( + results = ( + self.rr.table("pages") + .between( [site_id, r.minval, r.minval, r.minval], [site_id, r.maxval, r.maxval, r.maxval], - index="priority_by_site").filter({"hops_from_seed":0}).run() + index="priority_by_site", + ) + .filter({"hops_from_seed": 0}) + .run() + ) pages = list(results) if len(pages) > 1: - self.logger.warning( - "more than one seed page for site_id %s ?", site_id) + self.logger.warning("more than one seed page for site_id %s ?", site_id) if len(pages) < 1: return None return brozzler.Page(self.rr, pages[0]) def site_pages(self, site_id, brozzled=None): - ''' + """ Args: site_id (str or int): brozzled (bool): if true, results include only pages that have @@ -460,16 +532,14 @@ def site_pages(self, site_id, brozzled=None): not been brozzled; and if None (the default), all pages Returns: iterator of brozzler.Page - ''' + """ query = self.rr.table("pages").between( - [site_id, 1 if brozzled is True else 0, - r.minval, r.minval], - [site_id, 0 if brozzled is False else r.maxval, - r.maxval, r.maxval], - index="priority_by_site") + [site_id, 1 if brozzled is True else 0, r.minval, r.minval], + [site_id, 0 if brozzled is False else r.maxval, r.maxval, r.maxval], + index="priority_by_site", + ) self.logger.trace("running query: %r", query) results = query.run() for result in results: self.logger.trace("yielding result: %r", result) yield brozzler.Page(self.rr, result) - diff --git a/brozzler/model.py b/brozzler/model.py index 8e35d0a4..b0f216d5 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -1,4 +1,4 @@ -''' +""" brozzler/models.py - model classes representing jobs, sites, and pages, with related logic @@ -15,7 +15,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. -''' +""" import brozzler import base64 @@ -36,15 +36,18 @@ import zlib from typing import Optional + def load_schema(): - schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml') + schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml") with open(schema_file) as f: return yaml.safe_load(f) + class JobValidator(cerberus.Validator): def _validate_type_url(self, value): url = urllib.parse.urlparse(value) - return url.scheme in ('http', 'https', 'ftp') + return url.scheme in ("http", "https", "ftp") + class InvalidJobConf(Exception): def __init__(self, validator): @@ -53,15 +56,17 @@ def __init__(self, validator): # Cerberus does a nice job hiding the bad value. In the case I # debugged, I found it here. Maybe there's a better way to see it. value = validator._errors[0].info[0][0].info[0][0].value - self.errors['bad value'] = value + self.errors["bad value"] = value except: value = None + def validate_conf(job_conf, schema=load_schema()): v = JobValidator(schema) if not v.validate(job_conf, normalize=False): raise InvalidJobConf(v) + def merge(a, b): if isinstance(a, dict) and isinstance(b, dict): merged = dict(a) @@ -75,19 +80,22 @@ def merge(a, b): else: return a + def new_job_file(frontier, job_conf_file): - '''Returns new Job.''' + """Returns new Job.""" logging.info("loading %s", job_conf_file) with open(job_conf_file) as f: job_conf = yaml.safe_load(f) return new_job(frontier, job_conf) + def new_job(frontier, job_conf): - '''Returns new Job.''' + """Returns new Job.""" validate_conf(job_conf) - job = Job(frontier.rr, { - "conf": job_conf, "status": "ACTIVE", - "started": doublethink.utcnow()}) + job = Job( + frontier.rr, + {"conf": job_conf, "status": "ACTIVE", "started": doublethink.utcnow()}, + ) if "id" in job_conf: job.id = job_conf["id"] if "max_claimed_sites" in job_conf: @@ -108,32 +116,40 @@ def new_job(frontier, job_conf): # insert in batches to avoid this error # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in: - for batch in (pages[i:i+500] for i in range(0, len(pages), 500)): - logging.info('inserting batch of %s pages', len(batch)) - result = frontier.rr.table('pages').insert(batch).run() - for batch in (sites[i:i+100] for i in range(0, len(sites), 100)): - logging.info('inserting batch of %s sites', len(batch)) - result = frontier.rr.table('sites').insert(batch).run() - logging.info('job %s fully started', job.id) + for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)): + logging.info("inserting batch of %s pages", len(batch)) + result = frontier.rr.table("pages").insert(batch).run() + for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)): + logging.info("inserting batch of %s sites", len(batch)) + result = frontier.rr.table("sites").insert(batch).run() + logging.info("job %s fully started", job.id) return job + def new_seed_page(frontier, site): url = urlcanon.parse_url(site.seed) hashtag = (url.hash_sign + url.fragment).decode("utf-8") urlcanon.canon.remove_fragment(url) - page = brozzler.Page(frontier.rr, { - "url": str(url), - "site_id": site.get("id"), - "job_id": site.get("job_id"), - "hops_from_seed": 0, - "priority": 1000, - "needs_robots_check": True, - "hop_path": None}) + page = brozzler.Page( + frontier.rr, + { + "url": str(url), + "site_id": site.get("id"), + "job_id": site.get("job_id"), + "hops_from_seed": 0, + "priority": 1000, + "needs_robots_check": 
True, + "hop_path": None, + }, + ) if hashtag: - page.hashtags = [hashtag,] + page.hashtags = [ + hashtag, + ] return page + def new_site(frontier, site): logging.info("new site %s", site) site.id = site.id or str(uuid.uuid4()) @@ -148,9 +164,10 @@ def new_site(frontier, site): # finally block because we want to insert the Site no matter what site.save() + class ElapsedMixIn(object): def elapsed(self): - ''' + """ Returns elapsed crawl time as a float in seconds. This metric includes all the time that a site was in active rotation, @@ -158,21 +175,22 @@ def elapsed(self): In contrast `Site.active_brozzling_time` only counts time when a brozzler worker claimed the site and was actively brozzling it. - ''' + """ dt = 0 for ss in self.starts_and_stops[:-1]: - if ss['stop']: - dt += (ss['stop'] - ss['start']).total_seconds() + if ss["stop"]: + dt += (ss["stop"] - ss["start"]).total_seconds() else: self.logger.warning("missing expected ss['stop']") - dt += (doublethink.utcnow() - ss['start']).total_seconds() + dt += (doublethink.utcnow() - ss["start"]).total_seconds() ss = self.starts_and_stops[-1] - if ss['stop']: - dt += (ss['stop'] - ss['start']).total_seconds() - else: # crawl is active - dt += (doublethink.utcnow() - ss['start']).total_seconds() + if ss["stop"]: + dt += (ss["stop"] - ss["start"]).total_seconds() + else: # crawl is active + dt += (doublethink.utcnow() - ss["start"]).total_seconds() return dt + class Job(doublethink.Document, ElapsedMixIn): logger = logging.getLogger(__module__ + "." + __qualname__) table = "jobs" @@ -181,29 +199,30 @@ def populate_defaults(self): if not "status" in self: self.status = "ACTIVE" if not "starts_and_stops" in self: - if self.get("started"): # backward compatibility - self.starts_and_stops = [{ - "start": self.get("started"), - "stop": self.get("finished")}] + if self.get("started"): # backward compatibility + self.starts_and_stops = [ + {"start": self.get("started"), "stop": self.get("finished")} + ] del self["started"] if "finished" in self: del self["finished"] else: - self.starts_and_stops = [ - {"start":doublethink.utcnow(),"stop":None}] + self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}] def finish(self): if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]: self.logger.error( - "job is already finished status=%s " - "starts_and_stops[-1]['stop']=%s", self.status, - self.starts_and_stops[-1]["stop"]) + "job is already finished status=%s " "starts_and_stops[-1]['stop']=%s", + self.status, + self.starts_and_stops[-1]["stop"], + ) self.status = "FINISHED" self.starts_and_stops[-1]["stop"] = doublethink.utcnow() + class Site(doublethink.Document, ElapsedMixIn): logger = logging.getLogger(__module__ + "." 
+ __qualname__) - table = 'sites' + table = "sites" def populate_defaults(self): if not "status" in self: @@ -225,26 +244,26 @@ def populate_defaults(self): del self.scope["surt"] # backward compatibility - if ("max_hops_off_surt" in self.scope - and not "max_hops_off" in self.scope): + if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope: self.scope["max_hops_off"] = self.scope["max_hops_off_surt"] if "max_hops_off_surt" in self.scope: del self.scope["max_hops_off_surt"] if self.seed: self._accept_ssurt_if_not_redundant( - brozzler.site_surt_canon(self.seed).ssurt().decode('ascii')) + brozzler.site_surt_canon(self.seed).ssurt().decode("ascii") + ) if not "starts_and_stops" in self: - if self.get("start_time"): # backward compatibility - self.starts_and_stops = [{ - "start":self.get("start_time"),"stop":None}] + if self.get("start_time"): # backward compatibility + self.starts_and_stops = [ + {"start": self.get("start_time"), "stop": None} + ] if self.get("status") != "ACTIVE": self.starts_and_stops[0]["stop"] = self.last_disclaimed del self["start_time"] else: - self.starts_and_stops = [ - {"start":doublethink.utcnow(),"stop":None}] + self.starts_and_stops = [{"start": doublethink.utcnow(), "stop": None}] def __str__(self): return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed) @@ -253,11 +272,12 @@ def _accept_ssurt_if_not_redundant(self, ssurt): if not "accepts" in self.scope: self.scope["accepts"] = [] simple_rule_ssurts = ( - rule["ssurt"] for rule in self.scope["accepts"] - if set(rule.keys()) == {'ssurt'}) + rule["ssurt"] + for rule in self.scope["accepts"] + if set(rule.keys()) == {"ssurt"} + ) if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts): - self.logger.info( - "adding ssurt %s to scope accept rules", ssurt) + self.logger.info("adding ssurt %s to scope accept rules", ssurt) self.scope["accepts"].append({"ssurt": ssurt}) def note_seed_redirect(self, url): @@ -266,14 +286,14 @@ def note_seed_redirect(self, url): # if http://foo.com/ redirects to https://foo.com/a/b/c let's also # put all of https://foo.com/ in scope - if (canon_seed_redirect.authority == canon_seed.authority - and canon_seed_redirect.scheme != canon_seed.scheme): + if ( + canon_seed_redirect.authority == canon_seed.authority + and canon_seed_redirect.scheme != canon_seed.scheme + ): canon_seed.scheme = canon_seed_redirect.scheme - self._accept_ssurt_if_not_redundant( - canon_seed.ssurt().decode('ascii')) + self._accept_ssurt_if_not_redundant(canon_seed.ssurt().decode("ascii")) - self._accept_ssurt_if_not_redundant( - canon_seed_redirect.ssurt().decode('ascii')) + self._accept_ssurt_if_not_redundant(canon_seed_redirect.ssurt().decode("ascii")) def extra_headers(self, page: Optional["Page"] = None): hdrs = {} @@ -281,28 +301,34 @@ def extra_headers(self, page: Optional["Page"] = None): temp_warcprox_meta = copy.deepcopy(self.warcprox_meta) if "blocks" in self.warcprox_meta: # delete temp_warcprox_meta's 'blocks' (they may be big!) 
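The extra_headers() hunk that follows serializes the site's Warcprox-Meta "blocks" rules to compact JSON, zlib-compresses them, and base64-encodes the result into a "compressed_blocks" field so the header stays small. A minimal sketch of the round trip, assuming a hypothetical rules list; the decode side is simply the inverse of what the hunk does and is not code from this change:

import base64, json, zlib

blocks = [{"domain": "example.com"}]  # hypothetical block rules
blocks_str = json.dumps(blocks, separators=(",", ":"))
# encode side, as in the hunk below
compressed_blocks = base64.b64encode(zlib.compress(blocks_str.encode())).decode()
# decode side: reverse each step to recover the original rules
recovered = json.loads(zlib.decompress(base64.b64decode(compressed_blocks)).decode())
assert recovered == blocks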
- del temp_warcprox_meta['blocks'] + del temp_warcprox_meta["blocks"] # str-ify blocks - blocks_str = json.dumps(self.warcprox_meta['blocks'], separators=(',', ':')) + blocks_str = json.dumps( + self.warcprox_meta["blocks"], separators=(",", ":") + ) # encode(), compress, b64encode, decode() - temp_warcprox_meta['compressed_blocks'] = base64.b64encode(zlib.compress(blocks_str.encode())).decode() + temp_warcprox_meta["compressed_blocks"] = base64.b64encode( + zlib.compress(blocks_str.encode()) + ).decode() if page is not None: temp_warcprox_meta["metadata"]["hop_path"] = page.hop_path temp_warcprox_meta["metadata"]["brozzled_url"] = page.url temp_warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url - hdrs["Warcprox-Meta"] = json.dumps(temp_warcprox_meta, separators=(',', ':')) + hdrs["Warcprox-Meta"] = json.dumps( + temp_warcprox_meta, separators=(",", ":") + ) return hdrs def accept_reject_or_neither(self, url, parent_page=None): - ''' + """ Returns `True` (accepted), `False` (rejected), or `None` (no decision). `None` usually means rejected, unless `max_hops_off` comes into play. - ''' + """ if not isinstance(url, urlcanon.ParsedUrl): url = urlcanon.semantic(url) - if not url.scheme in (b'http', b'https'): + if not url.scheme in (b"http", b"https"): # XXX doesn't belong here maybe (where? worker ignores unknown # schemes?) return False @@ -311,12 +337,14 @@ def accept_reject_or_neither(self, url, parent_page=None): if parent_page: try_parent_urls.append(urlcanon.semantic(parent_page.url)) if parent_page.redirect_url: - try_parent_urls.append( - urlcanon.semantic(parent_page.redirect_url)) + try_parent_urls.append(urlcanon.semantic(parent_page.redirect_url)) # enforce max_hops - if (parent_page and "max_hops" in self.scope - and parent_page.hops_from_seed >= self.scope["max_hops"]): + if ( + parent_page + and "max_hops" in self.scope + and parent_page.hops_from_seed >= self.scope["max_hops"] + ): return False # enforce reject rules @@ -326,7 +354,7 @@ def accept_reject_or_neither(self, url, parent_page=None): if try_parent_urls: for parent_url in try_parent_urls: if rule.applies(url, parent_url): - return False + return False else: if rule.applies(url): return False @@ -337,7 +365,7 @@ def accept_reject_or_neither(self, url, parent_page=None): if try_parent_urls: for parent_url in try_parent_urls: if rule.applies(url, parent_url): - return True + return True else: if rule.applies(url): return True @@ -345,6 +373,7 @@ def accept_reject_or_neither(self, url, parent_page=None): # no decision if we reach here return None + class Page(doublethink.Document): logger = logging.getLogger(__module__ + "." + __qualname__) table = "pages" @@ -398,4 +427,3 @@ def canon_url(self): if self._canon_hurl is None: self._canon_hurl = urlcanon.semantic(self.url) return str(self._canon_hurl) - diff --git a/brozzler/pywb.py b/brozzler/pywb.py index 03d64b88..8b61841a 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -1,4 +1,4 @@ -''' +""" brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index, loading from warcs still being written to, canonicalization rules matching brozzler conventions, support for screenshot: and thumbnail: urls @@ -16,10 +16,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-''' +""" import sys import logging + try: import pywb.apps.cli import pywb.cdx.cdxdomainspecific @@ -30,9 +31,11 @@ import pywb.rewrite.wburl except ImportError as e: logging.critical( - '%s: %s\n\nYou might need to run "pip install ' - 'brozzler[easy]".\nSee README.rst for more information.', - type(e).__name__, e) + '%s: %s\n\nYou might need to run "pip install ' + 'brozzler[easy]".\nSee README.rst for more information.', + type(e).__name__, + e, + ) sys.exit(1) import doublethink import rethinkdb as rdb @@ -43,6 +46,7 @@ r = rdb.RethinkDB() + class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource): def __init__(self, servers, db, table): self.servers = servers @@ -67,70 +71,78 @@ def _gen_cdx_lines(self, rethink_results): # XXX inefficient, it gets parsed later, figure out how to # short-circuit this step and create the CDXObject directly blob = { - 'url': record['url'], - 'status': str(record['response_code']), - 'digest': record['sha1base32'], - 'length': str(record.get('record_length', '-')), - 'offset': str(record['offset']), - 'filename': record['filename'], + "url": record["url"], + "status": str(record["response_code"]), + "digest": record["sha1base32"], + "length": str(record.get("record_length", "-")), + "offset": str(record["offset"]), + "filename": record["filename"], } - if record['warc_type'] != 'revisit': - blob['mime'] = record['content_type'] or '-' + if record["warc_type"] != "revisit": + blob["mime"] = record["content_type"] or "-" else: - blob['mime'] = 'warc/revisit' + blob["mime"] = "warc/revisit" # b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}' - cdx_line = '{} {:%Y%m%d%H%M%S} {}'.format( - record['canon_surt'], record['timestamp'], - json.dumps(blob)) - yield cdx_line.encode('utf-8') + cdx_line = "{} {:%Y%m%d%H%M%S} {}".format( + record["canon_surt"], record["timestamp"], json.dumps(blob) + ) + yield cdx_line.encode("utf-8") def _query_rethinkdb(self, cdx_query): - start_key = cdx_query.key.decode('utf-8') - end_key = cdx_query.end_key.decode('utf-8') + start_key = cdx_query.key.decode("utf-8") + end_key = cdx_query.end_key.decode("utf-8") reql = self.rr.table(self.table).between( - [start_key[:150], r.minval], [end_key[:150], r.maxval], - index='abbr_canon_surt_timestamp', right_bound='closed') - reql = reql.order_by(index='abbr_canon_surt_timestamp') + [start_key[:150], r.minval], + [end_key[:150], r.maxval], + index="abbr_canon_surt_timestamp", + right_bound="closed", + ) + reql = reql.order_by(index="abbr_canon_surt_timestamp") # TODO support for POST, etc # http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails reql = reql.filter( - lambda capture: r.expr( - ['WARCPROX_WRITE_RECORD','GET']).contains( - capture['http_method'])) + lambda capture: r.expr(["WARCPROX_WRITE_RECORD", "GET"]).contains( + capture["http_method"] + ) + ) reql = reql.filter( - lambda capture: (capture['canon_surt'] >= start_key) - & (capture['canon_surt'] < end_key)) + lambda capture: (capture["canon_surt"] >= start_key) + & (capture["canon_surt"] < end_key) + ) if cdx_query.limit: reql = reql.limit(cdx_query.limit) - logging.debug('rethinkdb query: %s', reql) + logging.debug("rethinkdb query: %s", reql) results = reql.run() return results + class TheGoodUrlCanonicalizer(object): - ''' + """ Replacement for pywb.utils.canonicalize.UrlCanonicalizer that produces 
surts with scheme and with trailing comma, and does not "massage" www.foo.org into foo.org. - ''' + """ + def __init__(self, surt_ordered=True): - '''We are always surt ordered (surt_ordered param is ignored)''' + """We are always surt ordered (surt_ordered param is ignored)""" self.surt_ordered = True def __call__(self, url): try: - key = urlcanon.semantic(url).surt().decode('ascii') + key = urlcanon.semantic(url).surt().decode("ascii") # logging.debug('%s -> %s', url, key) return key except Exception as e: return url def replace_default_canonicalizer(): - '''Replace parent class of CustomUrlCanonicalizer with this class.''' + """Replace parent class of CustomUrlCanonicalizer with this class.""" pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = ( - TheGoodUrlCanonicalizer,) + TheGoodUrlCanonicalizer, + ) def good_surts_from_default(default_surt): - ''' + """ Takes a standard surt without scheme and without trailing comma, and returns a list of "good" surts that together match the same set of urls. For example: @@ -144,59 +156,64 @@ def good_surts_from_default(default_surt): 'http://(com,example,www,)/path', 'https://(com,example,www,)/path'] - ''' - if default_surt == '': - return [''] + """ + if default_surt == "": + return [""] - parts = default_surt.split(')', 1) + parts = default_surt.split(")", 1) if len(parts) == 2: orig_host_part, path_part = parts good_surts = [ - 'http://(%s,)%s' % (orig_host_part, path_part), - 'https://(%s,)%s' % (orig_host_part, path_part), - 'http://(%s,www,)%s' % (orig_host_part, path_part), - 'https://(%s,www,)%s' % (orig_host_part, path_part), + "http://(%s,)%s" % (orig_host_part, path_part), + "https://(%s,)%s" % (orig_host_part, path_part), + "http://(%s,www,)%s" % (orig_host_part, path_part), + "https://(%s,www,)%s" % (orig_host_part, path_part), ] - else: # no path part + else: # no path part host_part = parts[0] good_surts = [ - 'http://(%s' % host_part, - 'https://(%s' % host_part, + "http://(%s" % host_part, + "https://(%s" % host_part, ] return good_surts def monkey_patch_dsrules_init(): orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ + def cdx_dsrule_init(self, url_prefix, rules): good_surts = [] - url_prefixes = [url_prefix] if isinstance( - url_prefix, str) else url_prefix + url_prefixes = [url_prefix] if isinstance(url_prefix, str) else url_prefix for bad_surt in url_prefixes: good_surts.extend( - TheGoodUrlCanonicalizer.good_surts_from_default( - bad_surt)) - if 'match' in rules and 'regex' in rules['match']: - rules['match']['regex'] = r'https?://\(' + rules['match']['regex'] + TheGoodUrlCanonicalizer.good_surts_from_default(bad_surt) + ) + if "match" in rules and "regex" in rules["match"]: + rules["match"]["regex"] = r"https?://\(" + rules["match"]["regex"] orig_init(self, good_surts, rules) + pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init + def support_in_progress_warcs(): - ''' + """ Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still being written to (warcs having ".open" suffix). This way if a cdx entry references foo.warc.gz, pywb will try both foo.warc.gz and foo.warc.gz.open. 
- ''' + """ _orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__ + def _prefix_resolver_call(self, filename, cdx=None): raw_results = _orig_prefix_resolver_call(self, filename, cdx) results = [] for warc_path in raw_results: results.append(warc_path) - results.append('%s.open' % warc_path) + results.append("%s.open" % warc_path) return results + pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call + class SomeWbUrl(pywb.rewrite.wburl.WbUrl): def __init__(self, orig_url): import re @@ -211,14 +228,14 @@ def __init__(self, orig_url): pywb.rewrite.wburl.BaseWbUrl.__init__(self) if six.PY2 and isinstance(orig_url, six.text_type): - orig_url = orig_url.encode('utf-8') + orig_url = orig_url.encode("utf-8") orig_url = quote(orig_url) self._original_url = orig_url if not self._init_query(orig_url): if not self._init_replay(orig_url): - raise Exception('Invalid WbUrl: ', orig_url) + raise Exception("Invalid WbUrl: ", orig_url) new_uri = WbUrl.to_uri(self.url) @@ -227,21 +244,24 @@ def __init__(self, orig_url): self.url = new_uri # begin brozzler changes - if (self.url.startswith('urn:') or self.url.startswith('screenshot:') - or self.url.startswith('thumbnail:')): + if ( + self.url.startswith("urn:") + or self.url.startswith("screenshot:") + or self.url.startswith("thumbnail:") + ): return # end brozzler changes # protocol agnostic url -> http:// # no protocol -> http:// - #inx = self.url.find('://') + # inx = self.url.find('://') inx = -1 m = self.SCHEME_RX.match(self.url) if m: inx = m.span(1)[0] - #if inx < 0: - # check for other partially encoded variants + # if inx < 0: + # check for other partially encoded variants # m = self.PARTIAL_ENC_RX.match(self.url) # if m: # len_ = len(m.group(0)) @@ -253,27 +273,31 @@ def __init__(self, orig_url): self.url = self.DEFAULT_SCHEME + self.url else: inx += 2 - if inx < len(self.url) and self.url[inx] != '/': - self.url = self.url[:inx] + '/' + self.url[inx:] + if inx < len(self.url) and self.url[inx] != "/": + self.url = self.url[:inx] + "/" + self.url[inx:] + def _get_wburl_type(self): return SomeWbUrl + def monkey_patch_wburl(): pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type + class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli): def _extend_parser(self, arg_parser): super()._extend_parser(arg_parser) - arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex + arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter - arg_parser.epilog = ''' + arg_parser.epilog = """ Run pywb like so: $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback See README.rst for more information. -''' +""" + # copied and pasted from cdxdomainspecific.py, only changes are commented as # such below @@ -284,7 +308,7 @@ def _fuzzy_query_call(self, query): matched_rule = None - urlkey = to_native_str(query.key, 'utf-8') + urlkey = to_native_str(query.key, "utf-8") url = query.url filter_ = query.filters output = query.output @@ -306,42 +330,42 @@ def _fuzzy_query_call(self, query): if not matched_rule: return None - repl = '?' + repl = "?" 
if matched_rule.replace: repl = matched_rule.replace inx = url.find(repl) if inx > 0: - url = url[:inx + len(repl)] + url = url[: inx + len(repl)] # begin brozzler changes - if matched_rule.match_type == 'domain': + if matched_rule.match_type == "domain": orig_split_url = urlsplit(url) # remove the subdomain, path, query and fragment - host = orig_split_url.netloc.split('.', 1)[1] - new_split_url = (orig_split_url.scheme, host, '', '', '') + host = orig_split_url.netloc.split(".", 1)[1] + new_split_url = (orig_split_url.scheme, host, "", "", "") url = urlunsplit(new_split_url) # end brozzler changes params = query.params - params.update({'url': url, - 'matchType': matched_rule.match_type, - 'filter': filter_}) + params.update({"url": url, "matchType": matched_rule.match_type, "filter": filter_}) - if 'reverse' in params: - del params['reverse'] + if "reverse" in params: + del params["reverse"] - if 'closest' in params: - del params['closest'] + if "closest" in params: + del params["closest"] - if 'end_key' in params: - del params['end_key'] + if "end_key" in params: + del params["end_key"] return params + def monkey_patch_fuzzy_query(): pywb.cdx.cdxdomainspecific.FuzzyQuery.__call__ = _fuzzy_query_call + # copied and pasted from pywb/utils/canonicalize.py, only changes are commented # as such def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None): @@ -361,54 +385,56 @@ def inc_last_char(x): start_key = url_canon(url) - if match_type == 'exact': - end_key = start_key + '!' + if match_type == "exact": + end_key = start_key + "!" - elif match_type == 'prefix': + elif match_type == "prefix": # add trailing slash if url has it - if url.endswith('/') and not start_key.endswith('/'): - start_key += '/' + if url.endswith("/") and not start_key.endswith("/"): + start_key += "/" end_key = inc_last_char(start_key) - elif match_type == 'host': + elif match_type == "host": if surt_ordered: - host = start_key.split(')/')[0] + host = start_key.split(")/")[0] - start_key = host + ')/' - end_key = host + '*' + start_key = host + ")/" + end_key = host + "*" else: host = urlparse.urlsplit(url).netloc - start_key = host + '/' - end_key = host + '0' + start_key = host + "/" + end_key = host + "0" - elif match_type == 'domain': + elif match_type == "domain": if not surt_ordered: - msg = 'matchType=domain unsupported for non-surt' + msg = "matchType=domain unsupported for non-surt" raise UrlCanonicalizeException(msg) - host = start_key.split(')/')[0] + host = start_key.split(")/")[0] # if tld, use com, as start_key # otherwise, stick with com,example)/ - if ',' not in host: - start_key = host + ',' + if "," not in host: + start_key = host + "," else: - start_key = host + ')/' + start_key = host + ")/" # begin brozzler changes - end_key = host + '~' + end_key = host + "~" # end brozzler changes else: - raise UrlCanonicalizeException('Invalid match_type: ' + match_type) + raise UrlCanonicalizeException("Invalid match_type: " + match_type) return (start_key, end_key) + def monkey_patch_calc_search_range(): pywb.utils.canonicalize.calc_search_range = _calc_search_range pywb.cdx.query.calc_search_range = _calc_search_range + def main(argv=sys.argv): brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer() brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init() @@ -417,7 +443,10 @@ def main(argv=sys.argv): brozzler.pywb.monkey_patch_fuzzy_query() brozzler.pywb.monkey_patch_calc_search_range() wayback_cli = BrozzlerWaybackCli( - args=argv[1:], default_port=8880, - 
desc=('brozzler-wayback - pywb wayback (monkey-patched for use ' - 'with brozzler)')) + args=argv[1:], + default_port=8880, + desc=( + "brozzler-wayback - pywb wayback (monkey-patched for use " "with brozzler)" + ), + ) wayback_cli.run() diff --git a/brozzler/robots.py b/brozzler/robots.py index 4122093c..744c9968 100644 --- a/brozzler/robots.py +++ b/brozzler/robots.py @@ -1,4 +1,4 @@ -''' +""" brozzler/robots.py - robots.txt support Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring @@ -20,7 +20,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -''' +""" import json import logging @@ -34,48 +34,60 @@ # monkey-patch reppy to do substring user-agent matching, see top of file reppy.Utility.short_user_agent = lambda strng: strng + + def _reppy_rules_getitem(self, agent): - ''' + """ Find the user-agent token matching the supplied full user-agent, using a case-insensitive substring search. - ''' + """ lc_agent = agent.lower() for s in self.agents: if s in lc_agent: return self.agents[s] - return self.agents.get('*') + return self.agents.get("*") + + reppy.parser.Rules.__getitem__ = _reppy_rules_getitem + class _SessionRaiseOn420(requests.Session): timeout = 60 + def get(self, url, *args, **kwargs): res = super().get(url, timeout=self.timeout, *args, **kwargs) - if res.status_code == 420 and 'warcprox-meta' in res.headers: + if res.status_code == 420 and "warcprox-meta" in res.headers: raise brozzler.ReachedLimit( - warcprox_meta=json.loads(res.headers['warcprox-meta']), - http_payload=res.text) + warcprox_meta=json.loads(res.headers["warcprox-meta"]), + http_payload=res.text, + ) else: return res + _robots_caches = {} # {site_id:reppy.cache.RobotsCache} + + def _robots_cache(site, proxy=None): if not site.id in _robots_caches: req_sesh = _SessionRaiseOn420() - req_sesh.verify = False # ignore cert errors + req_sesh.verify = False # ignore cert errors if proxy: proxie = "http://%s" % proxy - req_sesh.proxies = {"http":proxie,"https":proxie} + req_sesh.proxies = {"http": proxie, "https": proxie} if site.extra_headers(): req_sesh.headers.update(site.extra_headers()) if site.user_agent: - req_sesh.headers['User-Agent'] = site.user_agent + req_sesh.headers["User-Agent"] = site.user_agent _robots_caches[site.id] = reppy.cache.RobotsCache( - session=req_sesh, disallow_forbidden=False) + session=req_sesh, disallow_forbidden=False + ) return _robots_caches[site.id] + def is_permitted_by_robots(site, url, proxy=None): - ''' + """ Checks if `url` is permitted by robots.txt. Treats any kind of error fetching robots.txt as "allow all". 
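is_permitted_by_robots() treats any error fetching robots.txt as "allow all", but lets warcprox limit signals and proxy failures bubble up to the caller. A minimal usage sketch, assuming a site and url obtained from the frontier and a hypothetical proxy address:

import brozzler

# site, url: assumed to come from the frontier, as elsewhere in this change
try:
    if brozzler.is_permitted_by_robots(site, url, proxy="localhost:8000"):
        pass  # ok to brozzle the url
except brozzler.ReachedLimit as e:
    pass  # warcprox responded 420; e.warcprox_meta carries the limit details
except brozzler.ProxyError:
    pass  # the proxy is down; the caller may retry later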
See @@ -89,25 +101,28 @@ def is_permitted_by_robots(site, url, proxy=None): Raises: brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit requests.exceptions.ProxyError: if the proxy is down - ''' + """ if site.ignore_robots: return True try: - result = _robots_cache(site, proxy).allowed( - url, site.user_agent or "brozzler") + result = _robots_cache(site, proxy).allowed(url, site.user_agent or "brozzler") return result except Exception as e: if isinstance(e, reppy.exceptions.ServerError) and isinstance( - e.args[0], brozzler.ReachedLimit): + e.args[0], brozzler.ReachedLimit + ): raise e.args[0] - elif hasattr(e, 'args') and isinstance( - e.args[0], requests.exceptions.ProxyError): + elif hasattr(e, "args") and isinstance( + e.args[0], requests.exceptions.ProxyError + ): # reppy has wrapped an exception that we want to bubble up raise brozzler.ProxyError(e) else: logging.warning( - "returning true (permitted) after problem fetching " - "robots.txt for %r: %r", url, e) + "returning true (permitted) after problem fetching " + "robots.txt for %r: %r", + url, + e, + ) return True - diff --git a/brozzler/worker.py b/brozzler/worker.py index 6399040b..86977cf5 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -1,4 +1,4 @@ -''' +""" brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning it runs yt-dlp on them, browses them and runs behaviors if appropriate, scopes and adds outlinks to the frontier @@ -16,7 +16,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -''' +""" import logging import brozzler @@ -39,6 +39,7 @@ r = rdb.RethinkDB() + class BrozzlerWorker: logger = logging.getLogger(__module__ + "." 
+ __qualname__) @@ -50,13 +51,26 @@ class BrozzlerWorker: SITE_SESSION_MINUTES = 15 def __init__( - self, frontier, service_registry=None, max_browsers=1, - chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, - skip_extract_outlinks=False, skip_visit_hashtags=False, - skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, - page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60, - download_throughput=-1, stealth=False, - window_height=900, window_width=1400): + self, + frontier, + service_registry=None, + max_browsers=1, + chrome_exe="chromium-browser", + warcprox_auto=False, + proxy=None, + skip_extract_outlinks=False, + skip_visit_hashtags=False, + skip_youtube_dl=False, + simpler404=False, + screenshot_full_page=False, + page_timeout=300, + behavior_timeout=900, + extract_outlinks_timeout=60, + download_throughput=-1, + stealth=False, + window_height=900, + window_width=1400, + ): self._frontier = frontier self._service_registry = service_registry self._max_browsers = max_browsers @@ -79,7 +93,8 @@ def __init__( self._stealth = stealth self._browser_pool = brozzler.browser.BrowserPool( - max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True) + max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True + ) self._browsing_threads = set() self._browsing_threads_lock = threading.Lock() @@ -88,24 +103,32 @@ def __init__( self._shutdown = threading.Event() def _choose_warcprox(self): - warcproxes = self._service_registry.available_services('warcprox') + warcproxes = self._service_registry.available_services("warcprox") if not warcproxes: return None # .group('proxy').count() makes this query about 99% more efficient - reql = self._frontier.rr.table('sites').between( - ['ACTIVE', r.minval], ['ACTIVE', r.maxval], - index='sites_last_disclaimed').group('proxy').count() - # returns results like + reql = ( + self._frontier.rr.table("sites") + .between( + ["ACTIVE", r.minval], + ["ACTIVE", r.maxval], + index="sites_last_disclaimed", + ) + .group("proxy") + .count() + ) + # returns results like # { # "wbgrp-svc030.us.archive.org:8000": 148, # "wbgrp-svc030.us.archive.org:8001": 145 # } proxy_scoreboard = dict(reql.run()) for warcprox in warcproxes: - address = '%s:%s' % (warcprox['host'], warcprox['port']) - warcprox['assigned_sites'] = proxy_scoreboard.get(address, 0) - warcproxes.sort(key=lambda warcprox: ( - warcprox['assigned_sites'], warcprox['load'])) + address = "%s:%s" % (warcprox["host"], warcprox["port"]) + warcprox["assigned_sites"] = proxy_scoreboard.get(address, 0) + warcproxes.sort( + key=lambda warcprox: (warcprox["assigned_sites"], warcprox["load"]) + ) # XXX make this heuristic more advanced? 
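        # after the sort above, warcproxes is ordered by fewest assigned
        # active sites first, with the registry-reported "load" as the
        # tie-breaker, so the first element is the least-busy instance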
return warcproxes[0] @@ -118,13 +141,15 @@ def _proxy_for(self, site): svc = self._choose_warcprox() if svc is None: raise brozzler.ProxyError( - 'no available instances of warcprox in the service ' - 'registry') - site.proxy = '%s:%s' % (svc['host'], svc['port']) + "no available instances of warcprox in the service " "registry" + ) + site.proxy = "%s:%s" % (svc["host"], svc["port"]) site.save() self.logger.info( - 'chose warcprox instance %r from service registry for %r', - site.proxy, site) + "chose warcprox instance %r from service registry for %r", + site.proxy, + site, + ) return site.proxy return None @@ -132,14 +157,16 @@ def _using_warcprox(self, site): if self._proxy: if self._proxy_is_warcprox is None: try: - response = requests.get('http://%s/status' % self._proxy) + response = requests.get("http://%s/status" % self._proxy) status = json.loads(response.text) - self._proxy_is_warcprox = (status['role'] == 'warcprox') + self._proxy_is_warcprox = status["role"] == "warcprox" except Exception as e: self._proxy_is_warcprox = False logging.info( - '%s %s warcprox', self._proxy, - 'IS' if self._proxy_is_warcprox else 'IS NOT') + "%s %s warcprox", + self._proxy, + "IS" if self._proxy_is_warcprox else "IS NOT", + ) return self._proxy_is_warcprox else: # I should have commented when I originally wrote this code, but I @@ -148,13 +175,20 @@ def _using_warcprox(self, site): return bool(site.proxy or self._warcprox_auto) def _warcprox_write_record( - self, warcprox_address, url, warc_type, content_type, - payload, extra_headers=None): - headers = {"Content-Type":content_type,"WARC-Type":warc_type,"Host":"N/A"} + self, + warcprox_address, + url, + warc_type, + content_type, + payload, + extra_headers=None, + ): + headers = {"Content-Type": content_type, "WARC-Type": warc_type, "Host": "N/A"} if extra_headers: headers.update(extra_headers) - request = urllib.request.Request(url, method="WARCPROX_WRITE_RECORD", - headers=headers, data=payload) + request = urllib.request.Request( + url, method="WARCPROX_WRITE_RECORD", headers=headers, data=payload + ) # XXX setting request.type="http" is a hack to stop urllib from trying # to tunnel if url is https @@ -165,26 +199,31 @@ def _warcprox_write_record( with urllib.request.urlopen(request, timeout=600) as response: if response.getcode() != 204: self.logger.warning( - 'got "%s %s" response on warcprox ' - 'WARCPROX_WRITE_RECORD request (expected 204)', - response.getcode(), response.reason) + 'got "%s %s" response on warcprox ' + "WARCPROX_WRITE_RECORD request (expected 204)", + response.getcode(), + response.reason, + ) return request, response except urllib.error.HTTPError as e: self.logger.warning( - 'got "%s %s" response on warcprox ' - 'WARCPROX_WRITE_RECORD request (expected 204)', - e.getcode(), e.info()) + 'got "%s %s" response on warcprox ' + "WARCPROX_WRITE_RECORD request (expected 204)", + e.getcode(), + e.info(), + ) return request, None except urllib.error.URLError as e: raise brozzler.ProxyError( - 'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e + "proxy error on WARCPROX_WRITE_RECORD %s" % url + ) from e except ConnectionError as e: raise brozzler.ProxyError( - 'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e + "proxy error on WARCPROX_WRITE_RECORD %s" % url + ) from e def thumb_jpeg(self, full_jpeg): - """Create JPEG thumbnail. 
- """ + """Create JPEG thumbnail.""" img = PIL.Image.open(io.BytesIO(full_jpeg)) thumb_width = 300 thumb_height = (thumb_width / img.size[0]) * img.size[1] @@ -193,8 +232,15 @@ def thumb_jpeg(self, full_jpeg): img.save(out, "jpeg", quality=95) return out.getbuffer() - def brozzle_page(self, browser, site, page, on_screenshot=None, - on_request=None, enable_youtube_dl=True): + def brozzle_page( + self, + browser, + site, + page, + on_screenshot=None, + on_request=None, + enable_youtube_dl=True, + ): self.logger.info("brozzling {}".format(page)) ydl_fetches = None outlinks = set() @@ -208,31 +254,38 @@ def brozzle_page(self, browser, site, page, on_screenshot=None, except brozzler.ProxyError: raise except Exception as e: - if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2 - and hasattr(e.exc_info[1], 'code') - and e.exc_info[1].code == 430): + if ( + hasattr(e, "exc_info") + and len(e.exc_info) >= 2 + and hasattr(e.exc_info[1], "code") + and e.exc_info[1].code == 430 + ): self.logger.info( - 'youtube-dl got %s %s processing %s', - e.exc_info[1].code, e.exc_info[1].msg, page.url) + "youtube-dl got %s %s processing %s", + e.exc_info[1].code, + e.exc_info[1].msg, + page.url, + ) else: self.logger.error( - 'youtube_dl raised exception on %s', page, - exc_info=True) + "youtube_dl raised exception on %s", page, exc_info=True + ) if self._needs_browsing(page, ydl_fetches): - self.logger.info('needs browsing: %s', page) + self.logger.info("needs browsing: %s", page) try: browser_outlinks = self._browse_page( - browser, site, page, on_screenshot, on_request) + browser, site, page, on_screenshot, on_request + ) outlinks.update(browser_outlinks) except brozzler.PageInterstitialShown: - self.logger.info('page interstitial shown (http auth): %s', page) + self.logger.info("page interstitial shown (http auth): %s", page) else: if not self._already_fetched(page, ydl_fetches): - self.logger.info('needs fetch: %s', page) + self.logger.info("needs fetch: %s", page) self._fetch_url(site, page=page) else: - self.logger.info('already fetched: %s', page) + self.logger.info("already fetched: %s", page) return outlinks @@ -242,85 +295,103 @@ def _on_screenshot(screenshot_jpeg): on_screenshot(screenshot_jpeg) if self._using_warcprox(site): self.logger.info( - "sending WARCPROX_WRITE_RECORD request to %s with " - "screenshot for %s", self._proxy_for(site), page) + "sending WARCPROX_WRITE_RECORD request to %s with " + "screenshot for %s", + self._proxy_for(site), + page, + ) thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg) self._warcprox_write_record( - warcprox_address=self._proxy_for(site), - url="screenshot:%s" % str(urlcanon.semantic(page.url)), - warc_type="resource", content_type="image/jpeg", - payload=screenshot_jpeg, - extra_headers=site.extra_headers(page)) + warcprox_address=self._proxy_for(site), + url="screenshot:%s" % str(urlcanon.semantic(page.url)), + warc_type="resource", + content_type="image/jpeg", + payload=screenshot_jpeg, + extra_headers=site.extra_headers(page), + ) self._warcprox_write_record( - warcprox_address=self._proxy_for(site), - url="thumbnail:%s" % str(urlcanon.semantic(page.url)), - warc_type="resource", content_type="image/jpeg", - payload=thumbnail_jpeg, - extra_headers=site.extra_headers(page)) + warcprox_address=self._proxy_for(site), + url="thumbnail:%s" % str(urlcanon.semantic(page.url)), + warc_type="resource", + content_type="image/jpeg", + payload=thumbnail_jpeg, + extra_headers=site.extra_headers(page), + ) def _on_response(chrome_msg): - if ('params' in chrome_msg - and 
'response' in chrome_msg['params'] - and 'mimeType' in chrome_msg['params']['response'] - and chrome_msg['params']['response'].get('mimeType', '').startswith('video/') - # skip manifests of DASH segmented video - - # see https://github.com/internetarchive/brozzler/pull/70 - and chrome_msg['params']['response']['mimeType'] != 'video/vnd.mpeg.dash.mpd' - and chrome_msg['params']['response'].get('status') in (200, 206)): + if ( + "params" in chrome_msg + and "response" in chrome_msg["params"] + and "mimeType" in chrome_msg["params"]["response"] + and chrome_msg["params"]["response"] + .get("mimeType", "") + .startswith("video/") + # skip manifests of DASH segmented video - + # see https://github.com/internetarchive/brozzler/pull/70 + and chrome_msg["params"]["response"]["mimeType"] + != "video/vnd.mpeg.dash.mpd" + and chrome_msg["params"]["response"].get("status") in (200, 206) + ): video = { - 'blame': 'browser', - 'url': chrome_msg['params']['response'].get('url'), - 'response_code': chrome_msg['params']['response']['status'], - 'content-type': chrome_msg['params']['response']['mimeType'], + "blame": "browser", + "url": chrome_msg["params"]["response"].get("url"), + "response_code": chrome_msg["params"]["response"]["status"], + "content-type": chrome_msg["params"]["response"]["mimeType"], } response_headers = CaseInsensitiveDict( - chrome_msg['params']['response']['headers']) - if 'content-length' in response_headers: - video['content-length'] = int(response_headers['content-length']) - if 'content-range' in response_headers: - video['content-range'] = response_headers['content-range'] - logging.debug('embedded video %s', video) - if not 'videos' in page: + chrome_msg["params"]["response"]["headers"] + ) + if "content-length" in response_headers: + video["content-length"] = int(response_headers["content-length"]) + if "content-range" in response_headers: + video["content-range"] = response_headers["content-range"] + logging.debug("embedded video %s", video) + if not "videos" in page: page.videos = [] page.videos.append(video) sw_fetched = set() + def _on_service_worker_version_updated(chrome_msg): # https://github.com/internetarchive/brozzler/issues/140 - self.logger.trace('%r', chrome_msg) - if chrome_msg.get('params', {}).get('versions'): - url = chrome_msg.get('params', {}).get('versions')[0]\ - .get('scriptURL') + self.logger.trace("%r", chrome_msg) + if chrome_msg.get("params", {}).get("versions"): + url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL") if url and url not in sw_fetched: - self.logger.info('fetching service worker script %s', url) + self.logger.info("fetching service worker script %s", url) self._fetch_url(site, url=url) sw_fetched.add(url) if not browser.is_running(): browser.start( - proxy=self._proxy_for(site), - cookie_db=site.get('cookie_db'), - window_height=self._window_height, - window_width=self._window_width) + proxy=self._proxy_for(site), + cookie_db=site.get("cookie_db"), + window_height=self._window_height, + window_width=self._window_width, + ) final_page_url, outlinks = browser.browse_page( - page.url, extra_headers=site.extra_headers(page), - behavior_parameters=site.get('behavior_parameters'), - username=site.get('username'), password=site.get('password'), - user_agent=site.get('user_agent'), - on_screenshot=_on_screenshot, on_response=_on_response, - on_request=on_request, - on_service_worker_version_updated=_on_service_worker_version_updated, - hashtags=page.hashtags, - skip_extract_outlinks=self._skip_extract_outlinks, - 
skip_visit_hashtags=self._skip_visit_hashtags, - skip_youtube_dl=self._skip_youtube_dl, - simpler404=self._simpler404, - screenshot_full_page=self._screenshot_full_page, - page_timeout=self._page_timeout, - behavior_timeout=self._behavior_timeout, - extract_outlinks_timeout=self._extract_outlinks_timeout, - download_throughput=self._download_throughput, - stealth=self._stealth) + page.url, + extra_headers=site.extra_headers(page), + behavior_parameters=site.get("behavior_parameters"), + username=site.get("username"), + password=site.get("password"), + user_agent=site.get("user_agent"), + on_screenshot=_on_screenshot, + on_response=_on_response, + on_request=on_request, + on_service_worker_version_updated=_on_service_worker_version_updated, + hashtags=page.hashtags, + skip_extract_outlinks=self._skip_extract_outlinks, + skip_visit_hashtags=self._skip_visit_hashtags, + skip_youtube_dl=self._skip_youtube_dl, + simpler404=self._simpler404, + screenshot_full_page=self._screenshot_full_page, + page_timeout=self._page_timeout, + behavior_timeout=self._behavior_timeout, + extract_outlinks_timeout=self._extract_outlinks_timeout, + download_throughput=self._download_throughput, + stealth=self._stealth, + ) if final_page_url != page.url: page.note_redirect(final_page_url) return outlinks @@ -328,22 +399,21 @@ def _on_service_worker_version_updated(chrome_msg): def _fetch_url(self, site, url=None, page=None): proxies = None if page: - url=page.url + url = page.url if self._proxy_for(site): proxies = { - 'http': 'http://%s' % self._proxy_for(site), - 'https': 'http://%s' % self._proxy_for(site), + "http": "http://%s" % self._proxy_for(site), + "https": "http://%s" % self._proxy_for(site), } - self.logger.info('fetching %s', url) + self.logger.info("fetching %s", url) try: # response is ignored requests.get( - url, proxies=proxies, headers=site.extra_headers(page), - verify=False) + url, proxies=proxies, headers=site.extra_headers(page), verify=False + ) except requests.exceptions.ProxyError as e: - raise brozzler.ProxyError( - 'proxy error fetching %s' % url) from e + raise brozzler.ProxyError("proxy error fetching %s" % url) from e def _needs_browsing(self, page, ydl_fetches): if ydl_fetches: @@ -351,8 +421,10 @@ def _needs_browsing(self, page, ydl_fetches): if not final_bounces: return True for txn in final_bounces: - if txn['response_headers'].get_content_type() in [ - 'text/html', 'application/xhtml+xml']: + if txn["response_headers"].get_content_type() in [ + "text/html", + "application/xhtml+xml", + ]: return True return False else: @@ -361,14 +433,13 @@ def _needs_browsing(self, page, ydl_fetches): def _already_fetched(self, page, ydl_fetches): if ydl_fetches: for fetch in ydl.final_bounces(ydl_fetches, page.url): - if (fetch['method'] == 'GET' and fetch['response_code'] == 200): + if fetch["method"] == "GET" and fetch["response_code"] == 200: return True return False def brozzle_site(self, browser, site): try: - site.last_claimed_by = '%s:%s' % ( - socket.gethostname(), browser.chrome.port) + site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port) site.save() start = time.time() page = None @@ -377,28 +448,28 @@ def brozzle_site(self, browser, site): # _proxy_for() call in log statement can raise brozzler.ProxyError # which is why we honor time limit and stop request first☝🏻 self.logger.info( - "brozzling site (proxy=%r) %s", - self._proxy_for(site), site) + "brozzling site (proxy=%r) %s", self._proxy_for(site), site + ) while time.time() - start < self.SITE_SESSION_MINUTES 
* 60: site.refresh() self._frontier.enforce_time_limit(site) self._frontier.honor_stop_request(site) - page = self._frontier.claim_page(site, "%s:%s" % ( - socket.gethostname(), browser.chrome.port)) + page = self._frontier.claim_page( + site, "%s:%s" % (socket.gethostname(), browser.chrome.port) + ) - if (page.needs_robots_check and - not brozzler.is_permitted_by_robots( - site, page.url, self._proxy_for(site))): + if page.needs_robots_check and not brozzler.is_permitted_by_robots( + site, page.url, self._proxy_for(site) + ): logging.warning("page %s is blocked by robots.txt", page.url) page.blocked_by_robots = True self._frontier.completed_page(site, page) else: outlinks = self.brozzle_page( - browser, site, page, - enable_youtube_dl=not self._skip_youtube_dl) + browser, site, page, enable_youtube_dl=not self._skip_youtube_dl + ) self._frontier.completed_page(site, page) - self._frontier.scope_and_schedule_outlinks( - site, page, outlinks) + self._frontier.scope_and_schedule_outlinks(site, page, outlinks) if browser.is_running(): site.cookie_db = browser.chrome.persist_and_read_cookie_db() @@ -418,31 +489,36 @@ def brozzle_site(self, browser, site): except brozzler.ProxyError as e: if self._warcprox_auto: logging.error( - 'proxy error (site.proxy=%s), will try to choose a ' - 'healthy instance next time site is brozzled: %s', - site.proxy, e) + "proxy error (site.proxy=%s), will try to choose a " + "healthy instance next time site is brozzled: %s", + site.proxy, + e, + ) site.proxy = None else: # using brozzler-worker --proxy, nothing to do but try the # same proxy again next time - logging.error( - 'proxy error (self._proxy=%r)', self._proxy, exc_info=1) + logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1) except: self.logger.error( - 'unexpected exception site=%r page=%r', site, page, - exc_info=True) + "unexpected exception site=%r page=%r", site, page, exc_info=True + ) if page: page.failed_attempts = (page.failed_attempts or 0) + 1 if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES: self.logger.info( - 'marking page "completed" after %s unexpected ' - 'exceptions attempting to brozzle %s', - page.failed_attempts, page) + 'marking page "completed" after %s unexpected ' + "exceptions attempting to brozzle %s", + page.failed_attempts, + page, + ) self._frontier.completed_page(site, page) page = None finally: if start: - site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start + site.active_brozzling_time = ( + (site.active_brozzling_time or 0) + time.time() - start + ) self._frontier.disclaim_site(site, page) def _brozzle_site_thread_target(self, browser, site): @@ -462,21 +538,25 @@ def _service_heartbeat(self): "role": "brozzler-worker", "ttl": self.HEARTBEAT_INTERVAL * 3, } - status_info["load"] = 1.0 * self._browser_pool.num_in_use() / self._browser_pool.size + status_info["load"] = ( + 1.0 * self._browser_pool.num_in_use() / self._browser_pool.size + ) status_info["browser_pool_size"] = self._browser_pool.size status_info["browsers_in_use"] = self._browser_pool.num_in_use() try: self.status_info = self._service_registry.heartbeat(status_info) - self.logger.trace( - "status in service registry: %s", self.status_info) + self.logger.trace("status in service registry: %s", self.status_info) except r.ReqlError as e: self.logger.error( - "failed to send heartbeat and update service registry " - "with info %s: %s", status_info, e) + "failed to send heartbeat and update service registry " + "with info %s: %s", + status_info, + e, + ) def 
_service_heartbeat_if_due(self): - '''Sends service registry heartbeat if due''' + """Sends service registry heartbeat if due""" due = False if self._service_registry: if not hasattr(self, "status_info"): @@ -489,15 +569,16 @@ def _service_heartbeat_if_due(self): self._service_heartbeat() def _start_browsing_some_sites(self): - ''' + """ Starts browsing some sites. Raises: NoBrowsersAvailable if none available - ''' + """ # acquire_multi() raises NoBrowsersAvailable if none available browsers = self._browser_pool.acquire_multi( - (self._browser_pool.num_available() + 1) // 2) + (self._browser_pool.num_available() + 1) // 2 + ) try: sites = self._frontier.claim_sites(len(browsers)) except: @@ -507,10 +588,11 @@ def _start_browsing_some_sites(self): for i in range(len(browsers)): if i < len(sites): th = threading.Thread( - target=self._brozzle_site_thread_target, - args=(browsers[i], sites[i]), - name="BrozzlingThread:%s" % browsers[i].chrome.port, - daemon=True) + target=self._brozzle_site_thread_target, + args=(browsers[i], sites[i]), + name="BrozzlingThread:%s" % browsers[i].chrome.port, + daemon=True, + ) with self._browsing_threads_lock: self._browsing_threads.add(th) th.start() @@ -519,7 +601,8 @@ def _start_browsing_some_sites(self): def run(self): self.logger.notice( - 'brozzler %s - brozzler-worker starting', brozzler.__version__) + "brozzler %s - brozzler-worker starting", brozzler.__version__ + ) last_nothing_to_claim = 0 try: while not self._shutdown.is_set(): @@ -528,39 +611,38 @@ def run(self): try: self._start_browsing_some_sites() except brozzler.browser.NoBrowsersAvailable: - logging.trace( - "all %s browsers are in use", - self._max_browsers) + logging.trace("all %s browsers are in use", self._max_browsers) except brozzler.NothingToClaim: last_nothing_to_claim = time.time() logging.trace( - "nothing to claim, all available active sites " - "are already claimed by a brozzler worker") + "nothing to claim, all available active sites " + "are already claimed by a brozzler worker" + ) time.sleep(0.5) self.logger.notice("shutdown requested") except r.ReqlError as e: self.logger.error( - "caught rethinkdb exception, will try to proceed", - exc_info=True) + "caught rethinkdb exception, will try to proceed", exc_info=True + ) except brozzler.ShutdownRequested: self.logger.info("shutdown requested") except: self.logger.critical( - "thread exiting due to unexpected exception", - exc_info=True) + "thread exiting due to unexpected exception", exc_info=True + ) finally: if self._service_registry and hasattr(self, "status_info"): try: self._service_registry.unregister(self.status_info["id"]) except: self.logger.error( - "failed to unregister from service registry", - exc_info=True) + "failed to unregister from service registry", exc_info=True + ) self.logger.info( - 'shutting down %s brozzling threads', - len(self._browsing_threads)) + "shutting down %s brozzling threads", len(self._browsing_threads) + ) with self._browsing_threads_lock: for th in self._browsing_threads: if th.is_alive(): @@ -575,11 +657,10 @@ def start(self): with self._start_stop_lock: if self._thread: self.logger.warning( - 'ignoring start request because self._thread is ' - 'not None') + "ignoring start request because self._thread is " "not None" + ) return - self._thread = threading.Thread( - target=self.run, name="BrozzlerWorker") + self._thread = threading.Thread(target=self.run, name="BrozzlerWorker") self._thread.start() def shutdown_now(self): @@ -590,4 +671,3 @@ def stop(self): def is_alive(self): return 
self._thread and self._thread.is_alive() - diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 2eec17cd..4281d4a5 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -1,4 +1,4 @@ -''' +""" brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler Copyright (C) 2023 Internet Archive @@ -14,7 +14,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -''' +""" import logging import yt_dlp @@ -31,6 +31,7 @@ thread_local = threading.local() + class ExtraHeaderAdder(urllib.request.BaseHandler): def __init__(self, extra_headers): self.extra_headers = extra_headers @@ -43,6 +44,7 @@ def _http_request(self, req): req.add_header(h, v) return req + class YoutubeDLSpy(urllib.request.BaseHandler): logger = logging.getLogger(__module__ + "." + __qualname__) @@ -51,10 +53,10 @@ def __init__(self): def _http_response(self, request, response): fetch = { - 'url': request.full_url, - 'method': request.get_method(), - 'response_code': response.code, - 'response_headers': response.headers, + "url": request.full_url, + "method": request.get_method(), + "response_code": response.code, + "response_headers": response.headers, } self.fetches.append(fetch) return response @@ -64,6 +66,7 @@ def _http_response(self, request, response): def reset(self): self.fetches = [] + def final_bounces(fetches, url): """ Resolves redirect chains in `fetches` and returns a list of fetches @@ -73,26 +76,28 @@ def final_bounces(fetches, url): """ redirects = {} for fetch in fetches: - # XXX check http status 301,302,303,307? check for "uri" header - # as well as "location"? see urllib.request.HTTPRedirectHandler - if 'location' in fetch['response_headers']: - redirects[fetch['url']] = fetch + # XXX check http status 301,302,303,307? check for "uri" header + # as well as "location"? see urllib.request.HTTPRedirectHandler + if "location" in fetch["response_headers"]: + redirects[fetch["url"]] = fetch final_url = url while final_url in redirects: fetch = redirects.pop(final_url) final_url = urllib.parse.urljoin( - fetch['url'], fetch['response_headers']['location']) + fetch["url"], fetch["response_headers"]["location"] + ) final_bounces = [] for fetch in fetches: - if fetch['url'] == final_url: + if fetch["url"] == final_url: final_bounces.append(fetch) return final_bounces + def _build_youtube_dl(worker, destdir, site, page): - ''' + """ Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`. The `YoutubeDL` instance does a few special brozzler-specific things: @@ -109,7 +114,7 @@ def _build_youtube_dl(worker, destdir, site, page): Returns: a yt-dlp `yt_dlp.YoutubeDL` instance - ''' + """ class _YoutubeDL(yt_dlp.YoutubeDL): logger = logging.getLogger(__module__ + "." 
+ __qualname__) @@ -117,31 +122,38 @@ class _YoutubeDL(yt_dlp.YoutubeDL): def add_default_extra_info(self, ie_result, ie, url): # hook in some logging super().add_default_extra_info(ie_result, ie, url) - if ie_result.get('_type') == 'playlist': - self.logger.info( - 'extractor %r found playlist in %s', ie.IE_NAME, url) - if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}: + if ie_result.get("_type") == "playlist": + self.logger.info("extractor %r found playlist in %s", ie.IE_NAME, url) + if ie.IE_NAME in { + "youtube:playlist", + "youtube:tab", + "soundcloud:user", + "instagram:user", + }: # At this point ie_result['entries'] is an iterator that # will fetch more metadata from youtube to list all the # videos. We unroll that iterator here partly because # otherwise `process_ie_result()` will clobber it, and we # use it later to extract the watch pages as outlinks. try: - ie_result['entries_no_dl'] = list(ie_result['entries']) + ie_result["entries_no_dl"] = list(ie_result["entries"]) except Exception as e: self.logger.warning( - "failed to unroll ie_result['entries']? for %s, %s; exception %s", - ie.IE_NAME, url, e) - ie_result['entries_no_dl'] =[] - ie_result['entries'] = [] + "failed to unroll ie_result['entries']? for %s, %s; exception %s", + ie.IE_NAME, + url, + e, + ) + ie_result["entries_no_dl"] = [] + ie_result["entries"] = [] self.logger.info( - 'not downloading %s media files from this ' - 'playlist because we expect to capture them from ' - 'individual watch/track/detail pages', - len(ie_result['entries_no_dl'])) + "not downloading %s media files from this " + "playlist because we expect to capture them from " + "individual watch/track/detail pages", + len(ie_result["entries_no_dl"]), + ) else: - self.logger.info( - 'extractor %r found a download in %s', ie.IE_NAME, url) + self.logger.info("extractor %r found a download in %s", ie.IE_NAME, url) def _push_video_to_warcprox(self, site, info_dict, postprocessor): # 220211 update: does yt-dlp supply content-type? no, not as such @@ -150,73 +162,96 @@ def _push_video_to_warcprox(self, site, info_dict, postprocessor): # youtube-dl produces a stitched-up video that /usr/bin/file fails # to identify (says "application/octet-stream"). `ffprobe` doesn't # give us a mimetype. 
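            # fallback order for the mimetype below: trust ext == "mp4",
            # otherwise sniff the downloaded file with python-magic, and if
            # python-magic is not importable, guess "video/<ext>" and warn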
- if info_dict.get('ext') == 'mp4': - mimetype = 'video/mp4' + if info_dict.get("ext") == "mp4": + mimetype = "video/mp4" else: try: import magic - mimetype = magic.from_file(info_dict['filepath'], mime=True) + + mimetype = magic.from_file(info_dict["filepath"], mime=True) except ImportError as e: - mimetype = 'video/%s' % info_dict['ext'] - self.logger.warning( - 'guessing mimetype %s because %r', mimetype, e) + mimetype = "video/%s" % info_dict["ext"] + self.logger.warning("guessing mimetype %s because %r", mimetype, e) # youtube watch page postprocessor is MoveFiles - if postprocessor == 'FixupM3u8' or postprocessor == 'Merger': - url = 'youtube-dl:%05d:%s' % ( - info_dict.get('playlist_index') or 1, - info_dict['webpage_url']) + if postprocessor == "FixupM3u8" or postprocessor == "Merger": + url = "youtube-dl:%05d:%s" % ( + info_dict.get("playlist_index") or 1, + info_dict["webpage_url"], + ) else: - url = info_dict.get('url', '') + url = info_dict.get("url", "") # skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8 - if url.endswith('.m3u8') or url == '': + if url.endswith(".m3u8") or url == "": return - size = os.path.getsize(info_dict['filepath']) + size = os.path.getsize(info_dict["filepath"]) self.logger.info( - 'pushing %r video as %s (%s bytes) to ' - 'warcprox at %s with url %s', info_dict['format'], - mimetype, size, worker._proxy_for(site), url) - with open(info_dict['filepath'], 'rb') as f: + "pushing %r video as %s (%s bytes) to " "warcprox at %s with url %s", + info_dict["format"], + mimetype, + size, + worker._proxy_for(site), + url, + ) + with open(info_dict["filepath"], "rb") as f: # include content-length header to avoid chunked # transfer, which warcprox currently rejects extra_headers = dict(site.extra_headers()) - extra_headers['content-length'] = size + extra_headers["content-length"] = size request, response = worker._warcprox_write_record( - warcprox_address=worker._proxy_for(site), url=url, - warc_type='resource', content_type=mimetype, payload=f, - extra_headers=extra_headers) + warcprox_address=worker._proxy_for(site), + url=url, + warc_type="resource", + content_type=mimetype, + payload=f, + extra_headers=extra_headers, + ) # consulted by _remember_videos() - ydl.pushed_videos.append({ - 'url': url, - 'response_code': response.code, - 'content-type': mimetype, - 'content-length': size, - }) + ydl.pushed_videos.append( + { + "url": url, + "response_code": response.code, + "content-type": mimetype, + "content-length": size, + } + ) def maybe_heartbeat_site_last_claimed(*args, **kwargs): # in case yt-dlp takes a long time, heartbeat site.last_claimed # to prevent another brozzler-worker from claiming the site try: - if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES): + if ( + site.rr + and doublethink.utcnow() - site.last_claimed + > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES) + ): worker.logger.debug( - 'heartbeating site.last_claimed to prevent another ' - 'brozzler-worker claiming this site id=%r', site.id) + "heartbeating site.last_claimed to prevent another " + "brozzler-worker claiming this site id=%r", + site.id, + ) site.last_claimed = doublethink.utcnow() site.save() except: worker.logger.debug( - 'problem heartbeating site.last_claimed site id=%r', - site.id, exc_info=True) + "problem heartbeating site.last_claimed site id=%r", + site.id, + exc_info=True, + ) def ydl_postprocess_hook(d): - if d['status'] == 'finished': - worker.logger.info('[ydl_postprocess_hook] 
Finished postprocessing') - worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor'])) + if d["status"] == "finished": + worker.logger.info("[ydl_postprocess_hook] Finished postprocessing") + worker.logger.info( + "[ydl_postprocess_hook] postprocessor: {}".format(d["postprocessor"]) + ) if worker._using_warcprox(site): - _YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor']) + _YoutubeDL._push_video_to_warcprox( + _YoutubeDL, site, d["info_dict"], d["postprocessor"] + ) # default socket_timeout is 20 -- we hit it often when cluster is busy ydl_opts = { @@ -230,7 +265,6 @@ def ydl_postprocess_hook(d): "socket_timeout": 40, "progress_hooks": [maybe_heartbeat_site_last_claimed], "postprocessor_hooks": [ydl_postprocess_hook], - # https://github.com/yt-dlp/yt-dlp#format-selection # "By default, yt-dlp tries to download the best available quality..." # pre-v.2023.07.06: "format_sort": ["ext"], @@ -238,16 +272,13 @@ def ydl_postprocess_hook(d): # recommended: convert working cli to api call with # https://github.com/yt-dlp/yt-dlp/blob/master/devscripts/cli_to_api.py "format": "b/bv+ba", - "format_sort": ["res:720","vcodec:h264","acodec:aac"], + "format_sort": ["res:720", "vcodec:h264", "acodec:aac"], # skip live streams "match_filter": match_filter_func("!is_live"), - - "extractor_args": {'youtube': {'skip': ['dash', 'hls']}}, - + "extractor_args": {"youtube": {"skip": ["dash", "hls"]}}, # --cache-dir local or.. # this looked like a problem with nsf-mounted homedir, shouldn't be a problem for brozzler on focal? "cache_dir": "/home/archiveit", - "logger": logging.getLogger("yt_dlp"), "verbose": False, "quiet": False, @@ -265,49 +296,53 @@ def ydl_postprocess_hook(d): ydl._opener.add_handler(ydl.fetch_spy) return ydl + def _remember_videos(page, fetches, pushed_videos=None): - ''' + """ Saves info about videos captured by yt-dlp in `page.videos`. 
- ''' - if not 'videos' in page: + """ + if not "videos" in page: page.videos = [] for fetch in fetches or []: - content_type = fetch['response_headers'].get_content_type() - if (content_type.startswith('video/') - # skip manifests of DASH segmented video - - # see https://github.com/internetarchive/brozzler/pull/70 - and content_type != 'video/vnd.mpeg.dash.mpd' - and fetch['method'] == 'GET' - and fetch['response_code'] in (200, 206)): + content_type = fetch["response_headers"].get_content_type() + if ( + content_type.startswith("video/") + # skip manifests of DASH segmented video - + # see https://github.com/internetarchive/brozzler/pull/70 + and content_type != "video/vnd.mpeg.dash.mpd" + and fetch["method"] == "GET" + and fetch["response_code"] in (200, 206) + ): video = { - 'blame': 'youtube-dl', - 'url': fetch['url'], - 'response_code': fetch['response_code'], - 'content-type': content_type, + "blame": "youtube-dl", + "url": fetch["url"], + "response_code": fetch["response_code"], + "content-type": content_type, } - if 'content-length' in fetch['response_headers']: - video['content-length'] = int( - fetch['response_headers']['content-length']) - if 'content-range' in fetch['response_headers']: + if "content-length" in fetch["response_headers"]: + video["content-length"] = int( + fetch["response_headers"]["content-length"] + ) + if "content-range" in fetch["response_headers"]: # skip chunked youtube video - if 'googlevideo.com/videoplayback' in fetch['url']: + if "googlevideo.com/videoplayback" in fetch["url"]: continue - video['content-range'] = fetch[ - 'response_headers']['content-range'] - logging.debug('embedded video %s', video) + video["content-range"] = fetch["response_headers"]["content-range"] + logging.debug("embedded video %s", video) page.videos.append(video) for pushed_video in pushed_videos or []: - if pushed_video['content-type'].startswith('video/'): + if pushed_video["content-type"].startswith("video/"): video = { - 'blame': 'youtube-dl', - 'url': pushed_video['url'], - 'response_code': pushed_video['response_code'], - 'content-type': pushed_video['content-type'], - 'content-length': pushed_video['content-length'], + "blame": "youtube-dl", + "url": pushed_video["url"], + "response_code": pushed_video["response_code"], + "content-type": pushed_video["content-type"], + "content-length": pushed_video["content-length"], } - logging.debug('embedded video %s', video) + logging.debug("embedded video %s", video) page.videos.append(video) + def _try_youtube_dl(worker, ydl, site, page): try: logging.info("trying yt-dlp on %s", page) @@ -317,43 +352,53 @@ def _try_youtube_dl(worker, ydl, site, page): # no host given>" resulting in ProxyError # needs automated test # and yt-dlp needs sanitize_info for extract_info - ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url)))) + ie_result = ydl.sanitize_info( + ydl.extract_info(str(urlcanon.whatwg(page.url))) + ) _remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos) if worker._using_warcprox(site): info_json = json.dumps(ie_result, sort_keys=True, indent=4) logging.info( - "sending WARCPROX_WRITE_RECORD request to warcprox " - "with yt-dlp json for %s", page) + "sending WARCPROX_WRITE_RECORD request to warcprox " + "with yt-dlp json for %s", + page, + ) worker._warcprox_write_record( - warcprox_address=worker._proxy_for(site), - url="youtube-dl:%s" % str(urlcanon.semantic(page.url)), - warc_type="metadata", - content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", - 
payload=info_json.encode("utf-8"), - extra_headers=site.extra_headers(page)) + warcprox_address=worker._proxy_for(site), + url="youtube-dl:%s" % str(urlcanon.semantic(page.url)), + warc_type="metadata", + content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", + payload=info_json.encode("utf-8"), + extra_headers=site.extra_headers(page), + ) return ie_result except brozzler.ShutdownRequested as e: raise except Exception as e: if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError: return None - elif (hasattr(e, "exc_info") - and e.exc_info[0] == urllib.error.HTTPError - and hasattr(e.exc_info[1], "code") - and e.exc_info[1].code == 420): + elif ( + hasattr(e, "exc_info") + and e.exc_info[0] == urllib.error.HTTPError + and hasattr(e.exc_info[1], "code") + and e.exc_info[1].code == 420 + ): raise brozzler.ReachedLimit(e.exc_info[1]) - elif (hasattr(e, 'exc_info') - and e.exc_info[0] == urllib.error.URLError - and worker._proxy_for(site)): + elif ( + hasattr(e, "exc_info") + and e.exc_info[0] == urllib.error.URLError + and worker._proxy_for(site) + ): # connection problem when using a proxy == proxy error (XXX?) raise brozzler.ProxyError( - 'yt-dlp hit apparent proxy error from ' - '%s' % page.url) from e + "yt-dlp hit apparent proxy error from " "%s" % page.url + ) from e else: raise + def do_youtube_dl(worker, site, page): - ''' + """ Runs yt-dlp configured for `worker` and `site` to download videos from `page`. @@ -372,15 +417,19 @@ def do_youtube_dl(worker, site, page): 'response_headers': ..., }, ...] `list` of `str`: outlink urls - ''' - with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: + """ + with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir: ydl = _build_youtube_dl(worker, tempdir, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = set() - if ie_result and (ie_result.get('extractor') == 'youtube:playlist' or - ie_result.get('extractor') == 'youtube:tab'): + if ie_result and ( + ie_result.get("extractor") == "youtube:playlist" + or ie_result.get("extractor") == "youtube:tab" + ): # youtube watch pages as outlinks - outlinks = {'https://www.youtube.com/watch?v=%s' % e['id'] - for e in ie_result.get('entries_no_dl', [])} + outlinks = { + "https://www.youtube.com/watch?v=%s" % e["id"] + for e in ie_result.get("entries_no_dl", []) + } # any outlinks for other cases? return ydl.fetch_spy.fetches, outlinks diff --git a/setup.py b/setup.py index 5625ede5..e8fb25c3 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -''' +""" setup.py - brozzler setup script Copyright (C) 2014-2024 Internet Archive @@ -15,89 +15,88 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-''' +""" import setuptools import os + def find_package_data(package): pkg_data = [] - depth = len(package.split('.')) - path = os.path.join(*package.split('.')) + depth = len(package.split(".")) + path = os.path.join(*package.split(".")) for dirpath, dirnames, filenames in os.walk(path): - if not os.path.exists(os.path.join(dirpath, '__init__.py')): + if not os.path.exists(os.path.join(dirpath, "__init__.py")): relpath = os.path.join(*dirpath.split(os.sep)[depth:]) pkg_data.extend(os.path.join(relpath, f) for f in filenames) return pkg_data + setuptools.setup( - name='brozzler', - version='1.5.44', - description='Distributed web crawling with browsers', - url='https://github.com/internetarchive/brozzler', - author='Noah Levitt', - author_email='nlevitt@archive.org', - long_description=open('README.rst', mode='rb').read().decode('UTF-8'), - license='Apache License 2.0', - packages=['brozzler', 'brozzler.dashboard'], - package_data={ - 'brozzler': [ - 'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'], - 'brozzler.dashboard': find_package_data('brozzler.dashboard'), - }, - entry_points={ - 'console_scripts': [ - 'brozzle-page=brozzler.cli:brozzle_page', - 'brozzler-new-job=brozzler.cli:brozzler_new_job', - 'brozzler-new-site=brozzler.cli:brozzler_new_site', - 'brozzler-worker=brozzler.cli:brozzler_worker', - 'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables', - 'brozzler-list-captures=brozzler.cli:brozzler_list_captures', - 'brozzler-list-jobs=brozzler.cli:brozzler_list_jobs', - 'brozzler-list-sites=brozzler.cli:brozzler_list_sites', - 'brozzler-list-pages=brozzler.cli:brozzler_list_pages', - 'brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl', - 'brozzler-purge=brozzler.cli:brozzler_purge', - 'brozzler-dashboard=brozzler.dashboard:main', - 'brozzler-easy=brozzler.easy:main', - 'brozzler-wayback=brozzler.pywb:main', - ], - }, - install_requires=[ - 'PyYAML>=5.1', - 'yt_dlp<2023.11.16', - 'reppy==0.3.4', - 'requests>=2.21', - 'websocket-client>=0.39.0,<=0.48.0', - 'pillow>=5.2.0', - 'urlcanon>=0.1.dev23', - 'doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311', - 'rethinkdb<2.4.10', - 'cerberus>=1.0.1', - 'jinja2>=2.10', - 'cryptography>=2.3', - 'python-magic>=0.4.15', + name="brozzler", + version="1.5.44", + description="Distributed web crawling with browsers", + url="https://github.com/internetarchive/brozzler", + author="Noah Levitt", + author_email="nlevitt@archive.org", + long_description=open("README.rst", mode="rb").read().decode("UTF-8"), + license="Apache License 2.0", + packages=["brozzler", "brozzler.dashboard"], + package_data={ + "brozzler": ["js-templates/*.js*", "behaviors.yaml", "job_schema.yaml"], + "brozzler.dashboard": find_package_data("brozzler.dashboard"), + }, + entry_points={ + "console_scripts": [ + "brozzle-page=brozzler.cli:brozzle_page", + "brozzler-new-job=brozzler.cli:brozzler_new_job", + "brozzler-new-site=brozzler.cli:brozzler_new_site", + "brozzler-worker=brozzler.cli:brozzler_worker", + "brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables", + "brozzler-list-captures=brozzler.cli:brozzler_list_captures", + "brozzler-list-jobs=brozzler.cli:brozzler_list_jobs", + "brozzler-list-sites=brozzler.cli:brozzler_list_sites", + "brozzler-list-pages=brozzler.cli:brozzler_list_pages", + "brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl", + "brozzler-purge=brozzler.cli:brozzler_purge", + "brozzler-dashboard=brozzler.dashboard:main", + "brozzler-easy=brozzler.easy:main", + "brozzler-wayback=brozzler.pywb:main", + ], + 
}, + install_requires=[ + "PyYAML>=5.1", + "yt_dlp<2023.11.16", + "reppy==0.3.4", + "requests>=2.21", + "websocket-client>=0.39.0,<=0.48.0", + "pillow>=5.2.0", + "urlcanon>=0.1.dev23", + "doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311", + "rethinkdb<2.4.10", + "cerberus>=1.0.1", + "jinja2>=2.10", + "cryptography>=2.3", + "python-magic>=0.4.15", + ], + extras_require={ + "dashboard": ["flask>=1.0", "gunicorn>=19.8.1"], + "easy": [ + "warcprox>=2.4.31", + "pywb>=0.33.2,<2", + "flask>=1.0", + "gunicorn>=19.8.1", ], - extras_require={ - 'dashboard': [ - 'flask>=1.0', - 'gunicorn>=19.8.1' - ], - 'easy': [ - 'warcprox>=2.4.31', - 'pywb>=0.33.2,<2', - 'flask>=1.0', - 'gunicorn>=19.8.1' - ], - }, - zip_safe=False, - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Environment :: Console', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Topic :: Internet :: WWW/HTTP', - 'Topic :: System :: Archiving', - ]) + }, + zip_safe=False, + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Topic :: Internet :: WWW/HTTP", + "Topic :: System :: Archiving", + ], +) diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index bd4a0328..744a09a2 100755 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -''' +""" test_brozzling.py - XXX explain Copyright (C) 2016-2018 Internet Archive @@ -15,7 +15,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-''' +""" import pytest import brozzler @@ -34,79 +34,81 @@ brozzler.cli.configure_logging(args) WARCPROX_META_420 = { - 'stats': { - 'test_limits_bucket': { - 'total': {'urls': 0, 'wire_bytes': 0}, - 'new': {'urls': 0, 'wire_bytes': 0}, - 'revisit': {'urls': 0, 'wire_bytes': 0}, - 'bucket': 'test_limits_bucket' + "stats": { + "test_limits_bucket": { + "total": {"urls": 0, "wire_bytes": 0}, + "new": {"urls": 0, "wire_bytes": 0}, + "revisit": {"urls": 0, "wire_bytes": 0}, + "bucket": "test_limits_bucket", } }, - 'reached-limit': {'test_limits_bucket/total/urls': 0} + "reached-limit": {"test_limits_bucket/total/urls": 0}, } -@pytest.fixture(scope='module') + +@pytest.fixture(scope="module") def httpd(request): class RequestHandler(http.server.SimpleHTTPRequestHandler): def __init__(self, *args, **kwargs): - self.extensions_map['.mpd'] = 'video/vnd.mpeg.dash.mpd' + self.extensions_map[".mpd"] = "video/vnd.mpeg.dash.mpd" http.server.SimpleHTTPRequestHandler.__init__(self, *args, **kwargs) def do_GET(self): - if self.path == '/420': - self.send_response(420, 'Reached limit') - self.send_header('Connection', 'close') - self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420)) - payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n' - self.send_header('Content-Type', 'text/plain;charset=utf-8') - self.send_header('Content-Length', len(payload)) + if self.path == "/420": + self.send_response(420, "Reached limit") + self.send_header("Connection", "close") + self.send_header("Warcprox-Meta", json.dumps(WARCPROX_META_420)) + payload = b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n" + self.send_header("Content-Type", "text/plain;charset=utf-8") + self.send_header("Content-Length", len(payload)) self.end_headers() self.wfile.write(payload) - elif self.path == '/401': + elif self.path == "/401": self.send_response(401) - self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"') - self.send_header('Content-type', 'text/html') + self.send_header("WWW-Authenticate", 'Basic realm="Test"') + self.send_header("Content-type", "text/html") self.end_headers() - self.wfile.write(self.headers.get('Authorization', b'')) - self.wfile.write(b'not authenticated') + self.wfile.write(self.headers.get("Authorization", b"")) + self.wfile.write(b"not authenticated") else: super().do_GET() def do_POST(self): - if self.path == '/login-action': + if self.path == "/login-action": self.send_response(200) - payload = b'login successful\n' - self.send_header('Content-Type', 'text/plain;charset=utf-8') - self.send_header('Content-Length', len(payload)) + payload = b"login successful\n" + self.send_header("Content-Type", "text/plain;charset=utf-8") + self.send_header("Content-Length", len(payload)) self.end_headers() self.wfile.write(payload) else: super().do_POST() - # SimpleHTTPRequestHandler always uses CWD so we have to chdir - os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) + os.chdir(os.path.join(os.path.dirname(__file__), "htdocs")) - httpd = http.server.HTTPServer(('localhost', 0), RequestHandler) - httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) + httpd = http.server.HTTPServer(("localhost", 0), RequestHandler) + httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever) httpd_thread.start() def fin(): httpd.shutdown() httpd.server_close() httpd_thread.join() + request.addfinalizer(fin) return httpd + def test_httpd(httpd): - ''' + """ Tests that our http server is working as expected, and 
that two fetches of the same url return the same payload, proving it can be used to test deduplication. - ''' + """ payload1 = content2 = None - url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port + url = "http://localhost:%s/site1/file1.txt" % httpd.server_port with urllib.request.urlopen(url) as response: assert response.status == 200 payload1 = response.read() @@ -119,123 +121,136 @@ def test_httpd(httpd): assert payload1 == payload2 - url = 'http://localhost:%s/420' % httpd.server_port + url = "http://localhost:%s/420" % httpd.server_port with pytest.raises(urllib.error.HTTPError) as excinfo: urllib.request.urlopen(url) assert excinfo.value.getcode() == 420 + def test_aw_snap_hes_dead_jim(): chrome_exe = brozzler.suggest_default_chrome_exe() with brozzler.Browser(chrome_exe=chrome_exe) as browser: with pytest.raises(brozzler.BrowsingException): - browser.browse_page('chrome://crash') + browser.browse_page("chrome://crash") + # chromium's 401 handling changed??? @pytest.mark.xfail def test_page_interstitial_exception(httpd): chrome_exe = brozzler.suggest_default_chrome_exe() - url = 'http://localhost:%s/401' % httpd.server_port + url = "http://localhost:%s/401" % httpd.server_port with brozzler.Browser(chrome_exe=chrome_exe) as browser: with pytest.raises(brozzler.PageInterstitialShown): browser.browse_page(url) + def test_on_response(httpd): response_urls = [] + def on_response(msg): - response_urls.append(msg['params']['response']['url']) + response_urls.append(msg["params"]["response"]["url"]) chrome_exe = brozzler.suggest_default_chrome_exe() - url = 'http://localhost:%s/site3/page.html' % httpd.server_port + url = "http://localhost:%s/site3/page.html" % httpd.server_port with brozzler.Browser(chrome_exe=chrome_exe) as browser: browser.browse_page(url, on_response=on_response) - assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port - assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port - assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port + assert response_urls[0] == "http://localhost:%s/site3/page.html" % httpd.server_port + assert ( + response_urls[1] == "http://localhost:%s/site3/brozzler.svg" % httpd.server_port + ) + assert response_urls[2] == "http://localhost:%s/favicon.ico" % httpd.server_port + def test_420(httpd): chrome_exe = brozzler.suggest_default_chrome_exe() - url = 'http://localhost:%s/420' % httpd.server_port + url = "http://localhost:%s/420" % httpd.server_port with brozzler.Browser(chrome_exe=chrome_exe) as browser: with pytest.raises(brozzler.ReachedLimit) as excinfo: browser.browse_page(url) assert excinfo.value.warcprox_meta == WARCPROX_META_420 + def test_js_dialogs(httpd): chrome_exe = brozzler.suggest_default_chrome_exe() - url = 'http://localhost:%s/site4/alert.html' % httpd.server_port + url = "http://localhost:%s/site4/alert.html" % httpd.server_port with brozzler.Browser(chrome_exe=chrome_exe) as browser: # before commit d2ed6b97a24 these would hang and eventually raise # brozzler.browser.BrowsingTimeout, which would cause this test to fail + browser.browse_page("http://localhost:%s/site4/alert.html" % httpd.server_port) browser.browse_page( - 'http://localhost:%s/site4/alert.html' % httpd.server_port) - browser.browse_page( - 'http://localhost:%s/site4/confirm.html' % httpd.server_port) - browser.browse_page( - 'http://localhost:%s/site4/prompt.html' % httpd.server_port) + "http://localhost:%s/site4/confirm.html" % httpd.server_port + ) + 
browser.browse_page("http://localhost:%s/site4/prompt.html" % httpd.server_port) # XXX print dialog unresolved # browser.browse_page( # 'http://localhost:%s/site4/print.html' % httpd.server_port) + def test_page_videos(httpd): # test depends on behavior of youtube-dl and chromium, could fail and need # to be adjusted on youtube-dl or chromium updates chrome_exe = brozzler.suggest_default_chrome_exe() worker = brozzler.BrozzlerWorker(None) site = brozzler.Site(None, {}) - page = brozzler.Page(None, { - 'url':'http://localhost:%s/site6/' % httpd.server_port}) + page = brozzler.Page( + None, {"url": "http://localhost:%s/site6/" % httpd.server_port} + ) with brozzler.Browser(chrome_exe=chrome_exe) as browser: worker.brozzle_page(browser, site, page) assert page.videos assert len(page.videos) == 4 assert page.videos[0] == { - 'blame': 'youtube-dl', - 'response_code': 200, - 'content-length': 383631, - 'content-type': 'video/mp4', - 'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port, + "blame": "youtube-dl", + "response_code": 200, + "content-length": 383631, + "content-type": "video/mp4", + "url": "http://localhost:%s/site6/small.mp4" % httpd.server_port, } assert page.videos[1] == { - 'blame': 'youtube-dl', - 'content-length': 92728, - 'content-type': 'video/webm', - 'response_code': 200, - 'url': 'http://localhost:%s/site6/small-video_280x160_100k.webm' % httpd.server_port + "blame": "youtube-dl", + "content-length": 92728, + "content-type": "video/webm", + "response_code": 200, + "url": "http://localhost:%s/site6/small-video_280x160_100k.webm" + % httpd.server_port, } assert page.videos[2] == { - 'blame': 'youtube-dl', - 'content-length': 101114, - 'content-type': 'video/webm', - 'response_code': 200, - 'url': 'http://localhost:%s/site6/small-audio.webm' % httpd.server_port + "blame": "youtube-dl", + "content-length": 101114, + "content-type": "video/webm", + "response_code": 200, + "url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port, } assert page.videos[3] == { - 'blame': 'browser', + "blame": "browser", # 'response_code': 206, # 'content-range': 'bytes 0-229454/229455', - 'response_code': 200, - 'content-length': 229455, - 'content-type': 'video/webm', - 'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port, + "response_code": 200, + "content-length": 229455, + "content-type": "video/webm", + "url": "http://localhost:%s/site6/small.webm" % httpd.server_port, } + def test_extract_outlinks(httpd): chrome_exe = brozzler.suggest_default_chrome_exe() worker = brozzler.BrozzlerWorker(None) site = brozzler.Site(None, {}) - page = brozzler.Page(None, { - 'url':'http://localhost:%s/site8/' % httpd.server_port}) + page = brozzler.Page( + None, {"url": "http://localhost:%s/site8/" % httpd.server_port} + ) with brozzler.Browser(chrome_exe=chrome_exe) as browser: outlinks = worker.brozzle_page(browser, site, page) assert outlinks == { - 'http://example.com/offsite', - 'http://localhost:%s/site8/baz/zuh' % httpd.server_port, - 'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port, - 'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port + "http://example.com/offsite", + "http://localhost:%s/site8/baz/zuh" % httpd.server_port, + "http://localhost:%s/site8/fdjisapofdjisap#1" % httpd.server_port, + "http://localhost:%s/site8/fdjisapofdjisap#2" % httpd.server_port, } + def test_proxy_down(): - ''' + """ Test that browsing raises `brozzler.ProxyError` when proxy is down. See also `test_proxy_down` in test_units.py. 
@@ -243,40 +258,41 @@ def test_proxy_down(): Tests two different kinds of connection error: - nothing listening the port (nobody listens on on port 4 :)) - port bound but not accepting connections - ''' + """ sock = socket.socket() - sock.bind(('127.0.0.1', 0)) - for not_listening_proxy in ( - '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]): - site = brozzler.Site(None, {'seed':'http://example.com/'}) - page = brozzler.Page(None, {'url': 'http://example.com/'}) - - worker = brozzler.BrozzlerWorker( - frontier=None, proxy=not_listening_proxy) + sock.bind(("127.0.0.1", 0)) + for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]): + site = brozzler.Site(None, {"seed": "http://example.com/"}) + page = brozzler.Page(None, {"url": "http://example.com/"}) + + worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy) chrome_exe = brozzler.suggest_default_chrome_exe() with brozzler.Browser(chrome_exe=chrome_exe) as browser: with pytest.raises(brozzler.ProxyError): worker.brozzle_page(browser, site, page) + def test_try_login(httpd): - """Test try_login behavior. - """ + """Test try_login behavior.""" response_urls = [] + def on_response(msg): - response_urls.append(msg['params']['response']['url']) + response_urls.append(msg["params"]["response"]["url"]) + chrome_exe = brozzler.suggest_default_chrome_exe() - form_url = 'http://localhost:%s/site11/form1.html' % httpd.server_port - form_url_other = 'http://localhost:%s/site11/form2.html' % httpd.server_port - favicon_url = 'http://localhost:%s/favicon.ico' % httpd.server_port - login_url = 'http://localhost:%s/login-action' % httpd.server_port + form_url = "http://localhost:%s/site11/form1.html" % httpd.server_port + form_url_other = "http://localhost:%s/site11/form2.html" % httpd.server_port + favicon_url = "http://localhost:%s/favicon.ico" % httpd.server_port + login_url = "http://localhost:%s/login-action" % httpd.server_port # When username and password are defined and initial page has login form, # detect login form, submit login, and then return to the initial page. - username = 'user1' - password = 'pass1' + username = "user1" + password = "pass1" with brozzler.Browser(chrome_exe=chrome_exe) as browser: - browser.browse_page(form_url, username=username, password=password, - on_response=on_response) + browser.browse_page( + form_url, username=username, password=password, on_response=on_response + ) assert len(response_urls) == 4 assert response_urls[0] == form_url assert response_urls[1] == favicon_url @@ -285,11 +301,15 @@ def on_response(msg): # We are now supporting a different type of form, we'll test that here. 
response_urls = [] - username = 'user1' - password = 'pass1' + username = "user1" + password = "pass1" with brozzler.Browser(chrome_exe=chrome_exe) as browser: - browser.browse_page(form_url_other, username=username, password=password, - on_response=on_response) + browser.browse_page( + form_url_other, + username=username, + password=password, + on_response=on_response, + ) assert len(response_urls) == 4 assert response_urls[0] == form_url_other assert response_urls[1] == favicon_url @@ -306,10 +326,16 @@ def on_response(msg): # when the page doesn't have a form with username/password, don't submit it response_urls = [] - form_without_login_url = 'http://localhost:%s/site11/form-no-login.html' % httpd.server_port + form_without_login_url = ( + "http://localhost:%s/site11/form-no-login.html" % httpd.server_port + ) with brozzler.Browser(chrome_exe=chrome_exe) as browser: - browser.browse_page(form_without_login_url, username=username, - password=password, on_response=on_response) + browser.browse_page( + form_without_login_url, + username=username, + password=password, + on_response=on_response, + ) assert len(response_urls) == 2 assert response_urls[0] == form_without_login_url assert response_urls[1] == favicon_url diff --git a/tests/test_cli.py b/tests/test_cli.py index 03ec39a3..567260f9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -''' +""" test_cli.py - test brozzler commands Copyright (C) 2017 Internet Archive @@ -15,7 +15,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -''' +""" import brozzler.cli import pkg_resources @@ -23,59 +23,62 @@ import subprocess import doublethink + def cli_commands(): - commands = set(pkg_resources.get_entry_map( - 'brozzler')['console_scripts'].keys()) - commands.remove('brozzler-wayback') + commands = set(pkg_resources.get_entry_map("brozzler")["console_scripts"].keys()) + commands.remove("brozzler-wayback") try: import gunicorn except ImportError: - commands.remove('brozzler-dashboard') + commands.remove("brozzler-dashboard") try: import pywb except ImportError: - commands.remove('brozzler-easy') + commands.remove("brozzler-easy") return commands -@pytest.mark.parametrize('cmd', cli_commands()) + +@pytest.mark.parametrize("cmd", cli_commands()) def test_call_entrypoint(capsys, cmd): - entrypoint = pkg_resources.get_entry_map( - 'brozzler')['console_scripts'][cmd] + entrypoint = pkg_resources.get_entry_map("brozzler")["console_scripts"][cmd] callable = entrypoint.resolve() with pytest.raises(SystemExit): - callable(['/whatever/bin/%s' % cmd, '--version']) + callable(["/whatever/bin/%s" % cmd, "--version"]) out, err = capsys.readouterr() - assert out == 'brozzler %s - %s\n' % (brozzler.__version__, cmd) - assert err == '' + assert out == "brozzler %s - %s\n" % (brozzler.__version__, cmd) + assert err == "" + -@pytest.mark.parametrize('cmd', cli_commands()) +@pytest.mark.parametrize("cmd", cli_commands()) def test_run_command(capsys, cmd): proc = subprocess.Popen( - [cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + [cmd, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) out, err = proc.communicate() - assert err == b'' - assert out == ('brozzler %s - %s\n' % ( - brozzler.__version__, cmd)).encode('ascii') + assert err == b"" + assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii") + def test_rethinkdb_up(): - '''Check 
that rethinkdb is up and running.''' + """Check that rethinkdb is up and running.""" # check that rethinkdb is listening and looks sane - rr = doublethink.Rethinker(db='rethinkdb') # built-in db + rr = doublethink.Rethinker(db="rethinkdb") # built-in db tbls = rr.table_list().run() assert len(tbls) > 10 + # XXX don't know why this test is failing in travis-ci and vagrant while # test_call_entrypoint tests pass :( (also fails with capfd) @pytest.mark.xfail def test_stop_nonexistent_crawl(capsys): with pytest.raises(SystemExit): - brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--site=123']) + brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--site=123"]) out, err = capsys.readouterr() - assert err.endswith('site not found with id=123\n') - assert out == '' + assert err.endswith("site not found with id=123\n") + assert out == "" with pytest.raises(SystemExit): - brozzler.cli.brozzler_stop_crawl(['brozzler-stop-crawl', '--job=abc']) + brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--job=abc"]) out, err = capsys.readouterr() - assert err.endswith('''job not found with id='abc'\n''') - assert out == '' + assert err.endswith("""job not found with id='abc'\n""") + assert out == "" diff --git a/tests/test_cluster.py b/tests/test_cluster.py index fcff145d..51d78e32 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -''' +""" test_cluster.py - integration tests for a brozzler cluster, expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be running already @@ -16,7 +16,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -''' +""" import pytest import http.server @@ -35,57 +35,62 @@ import sys import warcprox + # https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib def _local_address(): import socket + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) try: - s.connect(('10.255.255.255', 1)) # ip doesn't need to be reachable + s.connect(("10.255.255.255", 1)) # ip doesn't need to be reachable return s.getsockname()[0] except: - return '127.0.0.1' + return "127.0.0.1" finally: s.close() + local_address = _local_address() + def start_service(service): - subprocess.check_call(['sudo', 'svc', '-u', '/etc/service/' + service]) + subprocess.check_call(["sudo", "svc", "-u", "/etc/service/" + service]) + def stop_service(service): - subprocess.check_call(['sudo', 'svc', '-d', '/etc/service/' + service]) + subprocess.check_call(["sudo", "svc", "-d", "/etc/service/" + service]) while True: - status = subprocess.check_output( - ['sudo', 'svstat', '/etc/service/' + service]) - if b' down ' in status: + status = subprocess.check_output(["sudo", "svstat", "/etc/service/" + service]) + if b" down " in status: break time.sleep(0.5) -@pytest.fixture(scope='module') + +@pytest.fixture(scope="module") def httpd(request): class RequestHandler(http.server.SimpleHTTPRequestHandler): def do_POST(self): - logging.info('\n%s\n%s', self.requestline, self.headers) + logging.info("\n%s\n%s", self.requestline, self.headers) self.do_GET() def do_GET(self): - logging.info('\n%s\n%s', self.requestline, self.headers) - if self.path == '/site5/redirect/': - self.send_response(303, 'See other') - self.send_header('Connection', 'close') - self.send_header('Content-Length', 0) - self.send_header('Location', '/site5/destination/') + logging.info("\n%s\n%s", self.requestline, 
self.headers) + if self.path == "/site5/redirect/": + self.send_response(303, "See other") + self.send_header("Connection", "close") + self.send_header("Content-Length", 0) + self.send_header("Location", "/site5/destination/") self.end_headers() - self.wfile.write(b'') - elif self.path == '/site9/redirect.html': - self.send_response(303, 'See other') - self.send_header('Connection', 'close') - self.send_header('Content-Length', 0) - self.send_header('Location', '/site9/destination.html') + self.wfile.write(b"") + elif self.path == "/site9/redirect.html": + self.send_response(303, "See other") + self.send_header("Connection", "close") + self.send_header("Content-Length", 0) + self.send_header("Location", "/site9/destination.html") self.end_headers() - self.wfile.write(b'') - elif self.path.startswith('/infinite/'): - payload = b''' + self.wfile.write(b"") + elif self.path.startswith("/infinite/"): + payload = b""" infinite site @@ -96,41 +101,44 @@ def do_GET(self): g/ h/ i/ -''' - self.send_response(200, 'OK') - self.send_header('Connection', 'close') - self.send_header('Content-Length', len(payload)) +""" + self.send_response(200, "OK") + self.send_header("Connection", "close") + self.send_header("Content-Length", len(payload)) self.end_headers() self.wfile.write(payload) else: super().do_GET() # SimpleHTTPRequestHandler always uses CWD so we have to chdir - os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) + os.chdir(os.path.join(os.path.dirname(__file__), "htdocs")) httpd = http.server.HTTPServer((local_address, 0), RequestHandler) - httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) + httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever) httpd_thread.start() def fin(): httpd.shutdown() httpd.server_close() httpd_thread.join() + request.addfinalizer(fin) return httpd + def make_url(httpd, rel_url): - return 'http://%s:%s%s' % (local_address, httpd.server_port, rel_url) + return "http://%s:%s%s" % (local_address, httpd.server_port, rel_url) + def test_httpd(httpd): - ''' + """ Tests that our http server is working as expected, and that two fetches of the same url return the same payload, proving it can be used to test deduplication. 
- ''' + """ payload1 = content2 = None - url = make_url(httpd, '/site1/file1.txt') + url = make_url(httpd, "/site1/file1.txt") with urllib.request.urlopen(url) as response: assert response.status == 200 payload1 = response.read() @@ -143,43 +151,49 @@ def test_httpd(httpd): assert payload1 == payload2 + def test_services_up(): - '''Check that the expected services are up and running.''' + """Check that the expected services are up and running.""" # check that rethinkdb is listening and looks sane - rr = doublethink.Rethinker(db='rethinkdb') # built-in db + rr = doublethink.Rethinker(db="rethinkdb") # built-in db tbls = rr.table_list().run() assert len(tbls) > 10 # check that warcprox is listening with socket.socket() as s: # if the connect fails an exception is raised and the test fails - s.connect(('localhost', 8000)) + s.connect(("localhost", 8000)) # check that pywb is listening with socket.socket() as s: # if the connect fails an exception is raised and the test fails - s.connect(('localhost', 8880)) + s.connect(("localhost", 8880)) # check that brozzler dashboard is listening with socket.socket() as s: # if the connect fails an exception is raised and the test fails - s.connect(('localhost', 8881)) + s.connect(("localhost", 8881)) + def test_brozzle_site(httpd): - test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat() - rr = doublethink.Rethinker('localhost', db='brozzler') - site = brozzler.Site(rr, { - 'seed': make_url(httpd, '/site1/'), - 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) + test_id = "test_brozzle_site-%s" % datetime.datetime.utcnow().isoformat() + rr = doublethink.Rethinker("localhost", db="brozzler") + site = brozzler.Site( + rr, + { + "seed": make_url(httpd, "/site1/"), + "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}}, + }, + ) # the two pages we expect to be crawled - page1 = make_url(httpd, '/site1/') - page2 = make_url(httpd, '/site1/file1.txt') - robots = make_url(httpd, '/robots.txt') + page1 = make_url(httpd, "/site1/") + page2 = make_url(httpd, "/site1/file1.txt") + robots = make_url(httpd, "/robots.txt") # so we can examine rethinkdb before it does anything try: - stop_service('brozzler-worker') + stop_service("brozzler-worker") assert site.id is None frontier = brozzler.RethinkDbFrontier(rr) @@ -187,148 +201,173 @@ def test_brozzle_site(httpd): assert site.id is not None assert len(list(frontier.site_pages(site.id))) == 1 finally: - start_service('brozzler-worker') + start_service("brozzler-worker") # the site should be brozzled fairly quickly start = time.time() - while site.status != 'FINISHED' and time.time() - start < 300: + while site.status != "FINISHED" and time.time() - start < 300: time.sleep(0.5) site.refresh() - assert site.status == 'FINISHED' + assert site.status == "FINISHED" # check that we got the two pages we expected pages = list(frontier.site_pages(site.id)) assert len(pages) == 2 assert {page.url for page in pages} == { - make_url(httpd, '/site1/'), make_url(httpd, '/site1/file1.txt')} + make_url(httpd, "/site1/"), + make_url(httpd, "/site1/file1.txt"), + } - time.sleep(2) # in case warcprox hasn't finished processing urls + time.sleep(2) # in case warcprox hasn't finished processing urls # take a look at the captures table - captures = rr.table('captures').filter({'test_id':test_id}).run() - captures_by_url = { - c['url']: c for c in captures if c['http_method'] != 'HEAD'} + captures = rr.table("captures").filter({"test_id": test_id}).run() + captures_by_url = {c["url"]: 
c for c in captures if c["http_method"] != "HEAD"} assert robots in captures_by_url assert page1 in captures_by_url assert page2 in captures_by_url - assert 'screenshot:%s' % page1 in captures_by_url - assert 'thumbnail:%s' % page1 in captures_by_url + assert "screenshot:%s" % page1 in captures_by_url + assert "thumbnail:%s" % page1 in captures_by_url # no screenshots of plaintext # check pywb - t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S') - wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2) - expected_payload = open(os.path.join( - os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read() + t14 = captures_by_url[page2]["timestamp"].strftime("%Y%m%d%H%M%S") + wb_url = "http://localhost:8880/brozzler/%s/%s" % (t14, page2) + expected_payload = open( + os.path.join(os.path.dirname(__file__), "htdocs", "site1", "file1.txt"), "rb" + ).read() assert requests.get(wb_url).content == expected_payload - url = 'screenshot:%s' % page1 - t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S') - wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url) + url = "screenshot:%s" % page1 + t14 = captures_by_url[url]["timestamp"].strftime("%Y%m%d%H%M%S") + wb_url = "http://localhost:8880/brozzler/%s/%s" % (t14, url) response = requests.get(wb_url) assert response.status_code == 200 - assert response.headers['content-type'] == 'image/jpeg' + assert response.headers["content-type"] == "image/jpeg" - url = 'thumbnail:%s' % page1 - t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S') - wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url) + url = "thumbnail:%s" % page1 + t14 = captures_by_url[url]["timestamp"].strftime("%Y%m%d%H%M%S") + wb_url = "http://localhost:8880/brozzler/%s/%s" % (t14, url) response = requests.get(wb_url) assert response.status_code == 200 - assert response.headers['content-type'] == 'image/jpeg' + assert response.headers["content-type"] == "image/jpeg" + def test_proxy_warcprox(httpd): - '''Test --proxy with proxy that happens to be warcprox''' + """Test --proxy with proxy that happens to be warcprox""" try: - stop_service('brozzler-worker') + stop_service("brozzler-worker") _test_proxy_setting( - httpd, proxy='localhost:8000', warcprox_auto=False, - is_warcprox=True) + httpd, proxy="localhost:8000", warcprox_auto=False, is_warcprox=True + ) finally: - start_service('brozzler-worker') + start_service("brozzler-worker") + def test_proxy_non_warcprox(httpd): - '''Test --proxy with proxy that happens not to be warcprox''' + """Test --proxy with proxy that happens not to be warcprox""" + class DumbProxyRequestHandler(http.server.SimpleHTTPRequestHandler): def do_HEAD(self): - if not hasattr(self.server, 'requests'): + if not hasattr(self.server, "requests"): self.server.requests = [] - logging.info('%s %s', self.command, self.path) - self.server.requests.append('%s %s' % (self.command, self.path)) + logging.info("%s %s", self.command, self.path) + self.server.requests.append("%s %s" % (self.command, self.path)) response = urllib.request.urlopen(self.path) - self.wfile.write(('HTTP/1.0 %s %s\r\n' % ( - response.code, response.reason)).encode('ascii')) + self.wfile.write( + ("HTTP/1.0 %s %s\r\n" % (response.code, response.reason)).encode( + "ascii" + ) + ) for header in response.getheaders(): - self.wfile.write(('%s: %s\r\n' % ( - header[0], header[1])).encode('ascii')) - self.wfile.write(b'\r\n') + self.wfile.write( + ("%s: %s\r\n" % (header[0], header[1])).encode("ascii") + ) + self.wfile.write(b"\r\n") return 
response + def do_GET(self): response = self.do_HEAD() self.copyfile(response, self.wfile) + def do_WARCPROX_WRITE_RECORD(self): - if not hasattr(self.server, 'requests'): + if not hasattr(self.server, "requests"): self.server.requests = [] - logging.info('%s %s', self.command, self.path) + logging.info("%s %s", self.command, self.path) self.send_error(400) - proxy = http.server.HTTPServer(('localhost', 0), DumbProxyRequestHandler) - th = threading.Thread(name='dumb-proxy', target=proxy.serve_forever) + proxy = http.server.HTTPServer(("localhost", 0), DumbProxyRequestHandler) + th = threading.Thread(name="dumb-proxy", target=proxy.serve_forever) th.start() try: - stop_service('brozzler-worker') + stop_service("brozzler-worker") _test_proxy_setting( - httpd, proxy='localhost:%s' % proxy.server_port, - warcprox_auto=False, is_warcprox=False) + httpd, + proxy="localhost:%s" % proxy.server_port, + warcprox_auto=False, + is_warcprox=False, + ) finally: - start_service('brozzler-worker') + start_service("brozzler-worker") assert len(proxy.requests) <= 15 - assert proxy.requests.count('GET /status') == 1 - assert ('GET %s' % make_url(httpd, '/site1/')) in proxy.requests - assert ('GET %s' % make_url(httpd, '/site1/file1.txt')) in proxy.requests - assert [req for req in proxy.requests if req.startswith('WARCPROX_WRITE_RECORD')] == [] + assert proxy.requests.count("GET /status") == 1 + assert ("GET %s" % make_url(httpd, "/site1/")) in proxy.requests + assert ("GET %s" % make_url(httpd, "/site1/file1.txt")) in proxy.requests + assert [ + req for req in proxy.requests if req.startswith("WARCPROX_WRITE_RECORD") + ] == [] proxy.shutdown() th.join() + def test_no_proxy(httpd): try: - stop_service('brozzler-worker') - _test_proxy_setting( - httpd, proxy=None, warcprox_auto=False, is_warcprox=False) + stop_service("brozzler-worker") + _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=False) finally: - start_service('brozzler-worker') + start_service("brozzler-worker") # XXX how to check that no proxy was used? 
+ def test_warcprox_auto(httpd): - '''Test --warcprox-auto''' + """Test --warcprox-auto""" try: - stop_service('brozzler-worker') - _test_proxy_setting( - httpd, proxy=None, warcprox_auto=True, is_warcprox=True) + stop_service("brozzler-worker") + _test_proxy_setting(httpd, proxy=None, warcprox_auto=True, is_warcprox=True) finally: - start_service('brozzler-worker') + start_service("brozzler-worker") + def test_proxy_conflict(): with pytest.raises(AssertionError) as excinfo: worker = brozzler.worker.BrozzlerWorker( - None, None, warcprox_auto=True, proxy='localhost:12345') + None, None, warcprox_auto=True, proxy="localhost:12345" + ) + -def _test_proxy_setting( - httpd, proxy=None, warcprox_auto=False, is_warcprox=False): - test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % ( - proxy, warcprox_auto, is_warcprox, - datetime.datetime.utcnow().isoformat()) +def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=False): + test_id = "test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s" % ( + proxy, + warcprox_auto, + is_warcprox, + datetime.datetime.utcnow().isoformat(), + ) # the two pages we expect to be crawled - page1 = make_url(httpd, '/site1/') - page2 = make_url(httpd, '/site1/file1.txt') - robots = make_url(httpd, '/robots.txt') + page1 = make_url(httpd, "/site1/") + page2 = make_url(httpd, "/site1/file1.txt") + robots = make_url(httpd, "/robots.txt") - rr = doublethink.Rethinker('localhost', db='brozzler') + rr = doublethink.Rethinker("localhost", db="brozzler") service_registry = doublethink.ServiceRegistry(rr) - site = brozzler.Site(rr, { - 'seed': make_url(httpd, '/site1/'), - 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) + site = brozzler.Site( + rr, + { + "seed": make_url(httpd, "/site1/"), + "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}}, + }, + ) assert site.id is None frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) @@ -336,23 +375,27 @@ def _test_proxy_setting( assert len(list(frontier.site_pages(site.id))) == 1 worker = brozzler.worker.BrozzlerWorker( - frontier, service_registry, max_browsers=1, - chrome_exe=brozzler.suggest_default_chrome_exe(), - warcprox_auto=warcprox_auto, proxy=proxy) + frontier, + service_registry, + max_browsers=1, + chrome_exe=brozzler.suggest_default_chrome_exe(), + warcprox_auto=warcprox_auto, + proxy=proxy, + ) browser = worker._browser_pool.acquire() worker.brozzle_site(browser, site) worker._browser_pool.release(browser) # check proxy is set - assert site.status == 'FINISHED' + assert site.status == "FINISHED" if warcprox_auto: - assert site.proxy[-5:] == ':8000' + assert site.proxy[-5:] == ":8000" else: assert not site.proxy - site.refresh() # check that these things were persisted - assert site.status == 'FINISHED' + site.refresh() # check that these things were persisted + assert site.status == "FINISHED" if warcprox_auto: - assert site.proxy[-5:] == ':8000' + assert site.proxy[-5:] == ":8000" else: assert not site.proxy @@ -360,41 +403,48 @@ def _test_proxy_setting( pages = list(frontier.site_pages(site.id)) assert len(pages) == 2 assert {page.url for page in pages} == { - make_url(httpd, '/site1/'), - make_url(httpd, '/site1/file1.txt')} + make_url(httpd, "/site1/"), + make_url(httpd, "/site1/file1.txt"), + } - time.sleep(2) # in case warcprox hasn't finished processing urls + time.sleep(2) # in case warcprox hasn't finished processing urls # take a look at the captures table - captures = 
rr.table('captures').filter({'test_id':test_id}).run() - captures_by_url = { - c['url']: c for c in captures if c['http_method'] != 'HEAD'} + captures = rr.table("captures").filter({"test_id": test_id}).run() + captures_by_url = {c["url"]: c for c in captures if c["http_method"] != "HEAD"} if is_warcprox: assert robots in captures_by_url assert page1 in captures_by_url assert page2 in captures_by_url - assert 'screenshot:%s' % page1 in captures_by_url - assert 'thumbnail:%s' % page1 in captures_by_url + assert "screenshot:%s" % page1 in captures_by_url + assert "thumbnail:%s" % page1 in captures_by_url # check pywb - t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S') - wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2) - expected_payload = open(os.path.join( - os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read() + t14 = captures_by_url[page2]["timestamp"].strftime("%Y%m%d%H%M%S") + wb_url = "http://localhost:8880/brozzler/%s/%s" % (t14, page2) + expected_payload = open( + os.path.join(os.path.dirname(__file__), "htdocs", "site1", "file1.txt"), + "rb", + ).read() assert requests.get(wb_url).content == expected_payload else: assert captures_by_url == {} + def test_obey_robots(httpd): - test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat() - rr = doublethink.Rethinker('localhost', db='brozzler') - site = brozzler.Site(rr, { - 'seed': make_url(httpd, '/site1/'), - 'user_agent': 'im a badbot', # robots.txt blocks badbot - 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) + test_id = "test_obey_robots-%s" % datetime.datetime.utcnow().isoformat() + rr = doublethink.Rethinker("localhost", db="brozzler") + site = brozzler.Site( + rr, + { + "seed": make_url(httpd, "/site1/"), + "user_agent": "im a badbot", # robots.txt blocks badbot + "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}}, + }, + ) # so we can examine rethinkdb before it does anything try: - stop_service('brozzler-worker') + stop_service("brozzler-worker") assert site.id is None frontier = brozzler.RethinkDbFrontier(rr) @@ -405,84 +455,111 @@ def test_obey_robots(httpd): assert site_pages[0].url == site.seed assert site_pages[0].needs_robots_check finally: - start_service('brozzler-worker') + start_service("brozzler-worker") # the site should be brozzled fairly quickly start = time.time() - while site.status != 'FINISHED' and time.time() - start < 300: + while site.status != "FINISHED" and time.time() - start < 300: time.sleep(0.5) site.refresh() - assert site.status == 'FINISHED' + assert site.status == "FINISHED" # check that only the one page is in rethinkdb pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 page = pages[0] - assert page.url == make_url(httpd, '/site1/') + assert page.url == make_url(httpd, "/site1/") assert page.blocked_by_robots # take a look at the captures table - time.sleep(2) # in case warcprox hasn't finished processing urls - robots_url = make_url(httpd, '/robots.txt') - captures = list(rr.table('captures').filter({'test_id':test_id}).run()) + time.sleep(2) # in case warcprox hasn't finished processing urls + robots_url = make_url(httpd, "/robots.txt") + captures = list(rr.table("captures").filter({"test_id": test_id}).run()) assert len(captures) == 1 - assert captures[0]['url'] == robots_url + assert captures[0]["url"] == robots_url # check pywb - t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S') - wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url) - 
expected_payload = open(os.path.join( - os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read() - assert requests.get( - wb_url, allow_redirects=False).content == expected_payload + t14 = captures[0]["timestamp"].strftime("%Y%m%d%H%M%S") + wb_url = "http://localhost:8880/brozzler/%s/%s" % (t14, robots_url) + expected_payload = open( + os.path.join(os.path.dirname(__file__), "htdocs", "robots.txt"), "rb" + ).read() + assert requests.get(wb_url, allow_redirects=False).content == expected_payload + def test_login(httpd): - test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat() - rr = doublethink.Rethinker('localhost', db='brozzler') - site = brozzler.Site(rr, { - 'seed': make_url(httpd, '/site2/'), - 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}, - 'username': 'test_username', 'password': 'test_password'}) + test_id = "test_login-%s" % datetime.datetime.utcnow().isoformat() + rr = doublethink.Rethinker("localhost", db="brozzler") + site = brozzler.Site( + rr, + { + "seed": make_url(httpd, "/site2/"), + "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}}, + "username": "test_username", + "password": "test_password", + }, + ) frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) # the site should be brozzled fairly quickly start = time.time() - while site.status != 'FINISHED' and time.time() - start < 300: + while site.status != "FINISHED" and time.time() - start < 300: time.sleep(0.5) site.refresh() - assert site.status == 'FINISHED' + assert site.status == "FINISHED" # take a look at the captures table - time.sleep(2) # in case warcprox hasn't finished processing urls - robots_url = make_url(httpd, '/robots.txt') - captures = list(rr.table('captures').filter( - {'test_id':test_id}).order_by('timestamp').run()) - meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures] + time.sleep(2) # in case warcprox hasn't finished processing urls + robots_url = make_url(httpd, "/robots.txt") + captures = list( + rr.table("captures").filter({"test_id": test_id}).order_by("timestamp").run() + ) + meth_url = ["%s %s" % (c["http_method"], c["url"]) for c in captures] # there are several forms in in htdocs/site2/login.html but only one # that brozzler's heuristic should match and try to submit, and it has # action='00', so we can check for that here - assert ('POST %s' % make_url(httpd, '/site2/00')) in meth_url + assert ("POST %s" % make_url(httpd, "/site2/00")) in meth_url # sanity check the rest of the crawl - assert ('GET %s' % make_url(httpd, '/robots.txt')) in meth_url - assert ('GET %s' % make_url(httpd, '/site2/')) in meth_url - assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/')) in meth_url - assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/')) in meth_url - assert ('GET %s' % make_url(httpd, '/site2/login.html')) in meth_url - assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/login.html')) in meth_url - assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/login.html')) in meth_url + assert ("GET %s" % make_url(httpd, "/robots.txt")) in meth_url + assert ("GET %s" % make_url(httpd, "/site2/")) in meth_url + assert ( + "WARCPROX_WRITE_RECORD screenshot:%s" % make_url(httpd, "/site2/") + ) in meth_url + assert ( + "WARCPROX_WRITE_RECORD thumbnail:%s" % make_url(httpd, "/site2/") + ) in meth_url + assert ("GET %s" % make_url(httpd, "/site2/login.html")) in meth_url + assert ( + "WARCPROX_WRITE_RECORD screenshot:%s" % 
make_url(httpd, "/site2/login.html") + ) in meth_url + assert ( + "WARCPROX_WRITE_RECORD thumbnail:%s" % make_url(httpd, "/site2/login.html") + ) in meth_url + def test_seed_redirect(httpd): - test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat() - rr = doublethink.Rethinker('localhost', db='brozzler') - seed_url = make_url(httpd, '/site5/redirect/') - site = brozzler.Site(rr, { - 'seed': make_url(httpd, '/site5/redirect/'), - 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) - assert site.scope == {'accepts': [{'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)}]} + test_id = "test_seed_redirect-%s" % datetime.datetime.utcnow().isoformat() + rr = doublethink.Rethinker("localhost", db="brozzler") + seed_url = make_url(httpd, "/site5/redirect/") + site = brozzler.Site( + rr, + { + "seed": make_url(httpd, "/site5/redirect/"), + "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}}, + }, + ) + assert site.scope == { + "accepts": [ + { + "ssurt": "%s//%s:http:/site5/redirect/" + % (local_address, httpd.server_port) + } + ] + } frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) @@ -490,10 +567,10 @@ def test_seed_redirect(httpd): # the site should be brozzled fairly quickly start = time.time() - while site.status != 'FINISHED' and time.time() - start < 300: + while site.status != "FINISHED" and time.time() - start < 300: time.sleep(0.5) site.refresh() - assert site.status == 'FINISHED' + assert site.status == "FINISHED" # take a look at the pages table pages = list(frontier.site_pages(site.id)) @@ -501,22 +578,36 @@ def test_seed_redirect(httpd): pages.sort(key=lambda page: page.hops_from_seed) assert pages[0].hops_from_seed == 0 assert pages[0].url == seed_url - assert pages[0].redirect_url == make_url(httpd, '/site5/destination/') + assert pages[0].redirect_url == make_url(httpd, "/site5/destination/") assert pages[1].hops_from_seed == 1 - assert pages[1].url == make_url(httpd, '/site5/destination/page2.html') + assert pages[1].url == make_url(httpd, "/site5/destination/page2.html") # check that scope has been updated properly - assert site.scope == {'accepts': [ - {'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)}, - {'ssurt': '%s//%s:http:/site5/destination/' % (local_address, httpd.server_port)}]} + assert site.scope == { + "accepts": [ + { + "ssurt": "%s//%s:http:/site5/redirect/" + % (local_address, httpd.server_port) + }, + { + "ssurt": "%s//%s:http:/site5/destination/" + % (local_address, httpd.server_port) + }, + ] + } + def test_hashtags(httpd): - test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() - rr = doublethink.Rethinker('localhost', db='brozzler') - seed_url = make_url(httpd, '/site7/') - site = brozzler.Site(rr, { - 'seed': seed_url, - 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) + test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat() + rr = doublethink.Rethinker("localhost", db="brozzler") + seed_url = make_url(httpd, "/site7/") + site = brozzler.Site( + rr, + { + "seed": seed_url, + "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}}, + }, + ) frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) @@ -524,10 +615,10 @@ def test_hashtags(httpd): # the site should be brozzled fairly quickly start = time.time() - while site.status != 'FINISHED' and time.time() - start < 300: + while site.status != "FINISHED" and time.time() - start < 300: 
time.sleep(0.5) site.refresh() - assert site.status == 'FINISHED' + assert site.status == "FINISHED" # check that we the page we expected pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) @@ -535,34 +626,42 @@ def test_hashtags(httpd): assert pages[0].url == seed_url assert pages[0].hops_from_seed == 0 assert pages[0].brozzle_count == 1 - assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site7/foo.html')] + assert pages[0].outlinks["accepted"] == [make_url(httpd, "/site7/foo.html")] assert not pages[0].hashtags - assert pages[1].url == make_url(httpd, '/site7/foo.html') + assert pages[1].url == make_url(httpd, "/site7/foo.html") assert pages[1].hops_from_seed == 1 assert pages[1].brozzle_count == 1 - assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',] + assert sorted(pages[1].hashtags) == [ + "#boosh", + "#ignored", + "#whee", + ] - time.sleep(2) # in case warcprox hasn't finished processing urls + time.sleep(2) # in case warcprox hasn't finished processing urls # take a look at the captures table - captures = rr.table('captures').filter({'test_id':test_id}).run() - captures_by_url = { - c['url']: c for c in captures if c['http_method'] != 'HEAD'} + captures = rr.table("captures").filter({"test_id": test_id}).run() + captures_by_url = {c["url"]: c for c in captures if c["http_method"] != "HEAD"} assert seed_url in captures_by_url - assert make_url(httpd, '/site7/foo.html') in captures_by_url - assert make_url(httpd, '/site7/whee.txt') in captures_by_url - assert make_url(httpd, '/site7/boosh.txt') in captures_by_url - assert 'screenshot:%s' % seed_url in captures_by_url - assert 'thumbnail:%s' % seed_url in captures_by_url - assert 'screenshot:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url - assert 'thumbnail:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url + assert make_url(httpd, "/site7/foo.html") in captures_by_url + assert make_url(httpd, "/site7/whee.txt") in captures_by_url + assert make_url(httpd, "/site7/boosh.txt") in captures_by_url + assert "screenshot:%s" % seed_url in captures_by_url + assert "thumbnail:%s" % seed_url in captures_by_url + assert "screenshot:%s" % make_url(httpd, "/site7/foo.html") in captures_by_url + assert "thumbnail:%s" % make_url(httpd, "/site7/foo.html") in captures_by_url + def test_redirect_hashtags(httpd): - test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() - rr = doublethink.Rethinker('localhost', db='brozzler') - seed_url = make_url(httpd, '/site9/') - site = brozzler.Site(rr, { - 'seed': seed_url, - 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) + test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat() + rr = doublethink.Rethinker("localhost", db="brozzler") + seed_url = make_url(httpd, "/site9/") + site = brozzler.Site( + rr, + { + "seed": seed_url, + "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}}, + }, + ) frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) @@ -570,10 +669,10 @@ def test_redirect_hashtags(httpd): # the site should be brozzled fairly quickly start = time.time() - while site.status != 'FINISHED' and time.time() - start < 300: + while site.status != "FINISHED" and time.time() - start < 300: time.sleep(0.5) site.refresh() - assert site.status == 'FINISHED' + assert site.status == "FINISHED" # check that we the page we expected pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) @@ -581,18 +680,26 @@ def test_redirect_hashtags(httpd): assert 
pages[0].url == seed_url assert pages[0].hops_from_seed == 0 assert pages[0].brozzle_count == 1 - assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site9/redirect.html')] + assert pages[0].outlinks["accepted"] == [make_url(httpd, "/site9/redirect.html")] assert not pages[0].hashtags - assert pages[1].url == make_url(httpd, '/site9/redirect.html') + assert pages[1].url == make_url(httpd, "/site9/redirect.html") assert pages[1].hops_from_seed == 1 assert pages[1].brozzle_count == 1 - assert sorted(pages[1].hashtags) == ['#hash1','#hash2',] + assert sorted(pages[1].hashtags) == [ + "#hash1", + "#hash2", + ] - time.sleep(2) # in case warcprox hasn't finished processing urls + time.sleep(2) # in case warcprox hasn't finished processing urls # take a look at the captures table - captures = rr.table('captures').filter({'test_id':test_id}).run() - redirect_captures = [c for c in captures if c['url'] == make_url(httpd, '/site9/redirect.html') and c['http_method'] == 'GET'] - assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags + captures = rr.table("captures").filter({"test_id": test_id}).run() + redirect_captures = [ + c + for c in captures + if c["url"] == make_url(httpd, "/site9/redirect.html") + and c["http_method"] == "GET" + ] + assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags # === expected captures === # 1. GET http://localhost:41243/favicon.ico @@ -610,16 +717,20 @@ def test_redirect_hashtags(httpd): # 13. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/ # 14. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/redirect.html + def test_stop_crawl(httpd): - test_id = 'test_stop_crawl_job-%s' % datetime.datetime.utcnow().isoformat() - rr = doublethink.Rethinker('localhost', db='brozzler') + test_id = "test_stop_crawl_job-%s" % datetime.datetime.utcnow().isoformat() + rr = doublethink.Rethinker("localhost", db="brozzler") frontier = brozzler.RethinkDbFrontier(rr) # create a new job with three sites that could be crawled forever - job_conf = {'seeds': [ - {'url': make_url(httpd, '/infinite/foo/')}, - {'url': make_url(httpd, '/infinite/bar/')}, - {'url': make_url(httpd, '/infinite/baz/')}]} + job_conf = { + "seeds": [ + {"url": make_url(httpd, "/infinite/foo/")}, + {"url": make_url(httpd, "/infinite/bar/")}, + {"url": make_url(httpd, "/infinite/baz/")}, + ] + } job = brozzler.new_job(frontier, job_conf) assert job.id @@ -628,51 +739,48 @@ def test_stop_crawl(httpd): assert not sites[1].stop_requested # request crawl stop for one site using the command line entrypoint - brozzler.cli.brozzler_stop_crawl([ - 'brozzler-stop-crawl', '--site=%s' % sites[0].id]) + brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--site=%s" % sites[0].id]) sites[0].refresh() assert sites[0].stop_requested # stop request should be honored quickly start = time.time() - while not sites[0].status.startswith( - 'FINISHED') and time.time() - start < 120: + while not sites[0].status.startswith("FINISHED") and time.time() - start < 120: time.sleep(0.5) sites[0].refresh() - assert sites[0].status == 'FINISHED_STOP_REQUESTED' + assert sites[0].status == "FINISHED_STOP_REQUESTED" # but the other sites and the job as a whole should still be crawling sites[1].refresh() - assert sites[1].status == 'ACTIVE' + assert sites[1].status == "ACTIVE" sites[2].refresh() - assert sites[2].status == 'ACTIVE' + assert sites[2].status == "ACTIVE" job.refresh() - assert job.status == 'ACTIVE' + assert job.status == "ACTIVE" # request crawl stop for the job using the 
command line entrypoint - brozzler.cli.brozzler_stop_crawl([ - 'brozzler-stop-crawl', '--job=%s' % job.id]) + brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--job=%s" % job.id]) job.refresh() assert job.stop_requested # stop request should be honored quickly start = time.time() - while not job.status.startswith( - 'FINISHED') and time.time() - start < 120: + while not job.status.startswith("FINISHED") and time.time() - start < 120: time.sleep(0.5) job.refresh() - assert job.status == 'FINISHED' + assert job.status == "FINISHED" # the other sites should also be FINISHED_STOP_REQUESTED sites[0].refresh() - assert sites[0].status == 'FINISHED_STOP_REQUESTED' + assert sites[0].status == "FINISHED_STOP_REQUESTED" sites[1].refresh() - assert sites[1].status == 'FINISHED_STOP_REQUESTED' + assert sites[1].status == "FINISHED_STOP_REQUESTED" sites[2].refresh() - assert sites[2].status == 'FINISHED_STOP_REQUESTED' + assert sites[2].status == "FINISHED_STOP_REQUESTED" + def test_warcprox_outage_resiliency(httpd): - ''' + """ Tests resiliency to warcprox outage. If no instances of warcprox are healthy when starting to crawl a site, @@ -682,37 +790,43 @@ def test_warcprox_outage_resiliency(httpd): over to a healthy instance. If all instances of warcprox go down, brozzler-worker should sit and wait. - ''' - rr = doublethink.Rethinker('localhost', db='brozzler') + """ + rr = doublethink.Rethinker("localhost", db="brozzler") frontier = brozzler.RethinkDbFrontier(rr) svcreg = doublethink.ServiceRegistry(rr) # run two instances of warcprox opts = warcprox.Options() - opts.address = '0.0.0.0' + opts.address = "0.0.0.0" opts.port = 0 - opts.rethinkdb_services_url = 'rethinkdb://localhost/brozzler/services' + opts.rethinkdb_services_url = "rethinkdb://localhost/brozzler/services" warcprox1 = warcprox.controller.WarcproxController(opts) warcprox2 = warcprox.controller.WarcproxController(opts) warcprox1_thread = threading.Thread( - target=warcprox1.run_until_shutdown, name='warcprox1') + target=warcprox1.run_until_shutdown, name="warcprox1" + ) warcprox2_thread = threading.Thread( - target=warcprox2.run_until_shutdown, name='warcprox2') + target=warcprox2.run_until_shutdown, name="warcprox2" + ) # put together a site to crawl - test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat() - site = brozzler.Site(rr, { - 'seed': make_url(httpd, '/infinite/'), - 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) + test_id = "test_warcprox_death-%s" % datetime.datetime.utcnow().isoformat() + site = brozzler.Site( + rr, + { + "seed": make_url(httpd, "/infinite/"), + "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}}, + }, + ) try: # we manage warcprox instances ourselves, so stop the one running on # the system, if any try: - stop_service('warcprox') + stop_service("warcprox") except Exception as e: - logging.warning('problem stopping warcprox service: %s', e) + logging.warning("problem stopping warcprox service: %s", e) # queue the site for brozzling brozzler.new_site(frontier, site) @@ -721,7 +835,7 @@ def test_warcprox_outage_resiliency(httpd): # XXX tail brozzler-worker.log or something? 
time.sleep(30) site.refresh() - assert site.status == 'ACTIVE' + assert site.status == "ACTIVE" assert not site.proxy assert len(list(frontier.site_pages(site.id))) == 1 @@ -733,13 +847,12 @@ def test_warcprox_outage_resiliency(httpd): while not site.proxy and time.time() - start < 30: time.sleep(0.5) site.refresh() - assert site.proxy.endswith(':%s' % warcprox1.proxy.server_port) + assert site.proxy.endswith(":%s" % warcprox1.proxy.server_port) # check that the site accumulates pages in the frontier, confirming # that crawling is really happening start = time.time() - while (len(list(frontier.site_pages(site.id))) <= 1 - and time.time() - start < 60): + while len(list(frontier.site_pages(site.id))) <= 1 and time.time() - start < 60: time.sleep(0.5) site.refresh() assert len(list(frontier.site_pages(site.id))) > 1 @@ -751,12 +864,13 @@ def test_warcprox_outage_resiliency(httpd): # check that it switched over to warcprox #2 start = time.time() - while ((not site.proxy - or not site.proxy.endswith(':%s' % warcprox2.proxy.server_port)) - and time.time() - start < 30): + while ( + not site.proxy + or not site.proxy.endswith(":%s" % warcprox2.proxy.server_port) + ) and time.time() - start < 30: time.sleep(0.5) site.refresh() - assert site.proxy.endswith(':%s' % warcprox2.proxy.server_port) + assert site.proxy.endswith(":%s" % warcprox2.proxy.server_port) # stop warcprox #2 warcprox2.stop.set() @@ -768,39 +882,36 @@ def test_warcprox_outage_resiliency(httpd): # check that it is waiting for a warcprox to appear time.sleep(30) site.refresh() - assert site.status == 'ACTIVE' + assert site.status == "ACTIVE" assert not site.proxy assert len(list(frontier.site_pages(site.id))) == page_count # stop crawling the site, else it can pollute subsequent test runs - brozzler.cli.brozzler_stop_crawl([ - 'brozzler-stop-crawl', '--site=%s' % site.id]) + brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--site=%s" % site.id]) site.refresh() assert site.stop_requested # stop request should be honored quickly start = time.time() - while not site.status.startswith( - 'FINISHED') and time.time() - start < 120: + while not site.status.startswith("FINISHED") and time.time() - start < 120: time.sleep(0.5) site.refresh() - assert site.status == 'FINISHED_STOP_REQUESTED' + assert site.status == "FINISHED_STOP_REQUESTED" finally: warcprox1.stop.set() warcprox2.stop.set() warcprox1_thread.join() warcprox2_thread.join() - start_service('warcprox') + start_service("warcprox") + def test_time_limit(httpd): - test_id = 'test_time_limit-%s' % datetime.datetime.utcnow().isoformat() - rr = doublethink.Rethinker('localhost', db='brozzler') + test_id = "test_time_limit-%s" % datetime.datetime.utcnow().isoformat() + rr = doublethink.Rethinker("localhost", db="brozzler") frontier = brozzler.RethinkDbFrontier(rr) # create a new job with one seed that could be crawled forever - job_conf = {'seeds': [{ - 'url': make_url(httpd, '/infinite/foo/'), - 'time_limit': 20}]} + job_conf = {"seeds": [{"url": make_url(httpd, "/infinite/foo/"), "time_limit": 20}]} job = brozzler.new_job(frontier, job_conf) assert job.id @@ -810,58 +921,63 @@ def test_time_limit(httpd): # time limit should be enforced pretty soon start = time.time() - while not sites[0].status.startswith( - 'FINISHED') and time.time() - start < 120: + while not sites[0].status.startswith("FINISHED") and time.time() - start < 120: time.sleep(0.5) sites[0].refresh() - assert sites[0].status == 'FINISHED_TIME_LIMIT' + assert sites[0].status == "FINISHED_TIME_LIMIT" # all sites 
finished so job should be finished too start = time.time() job.refresh() - while not job.status == 'FINISHED' and time.time() - start < 10: + while not job.status == "FINISHED" and time.time() - start < 10: time.sleep(0.5) job.refresh() - assert job.status == 'FINISHED' + assert job.status == "FINISHED" + def test_ydl_stitching(httpd): - test_id = 'test_ydl_stitching-%s' % datetime.datetime.utcnow().isoformat() - rr = doublethink.Rethinker('localhost', db='brozzler') + test_id = "test_ydl_stitching-%s" % datetime.datetime.utcnow().isoformat() + rr = doublethink.Rethinker("localhost", db="brozzler") frontier = brozzler.RethinkDbFrontier(rr) - site = brozzler.Site(rr, { - 'seed': make_url(httpd, '/site10/'), - 'warcprox_meta': { - 'warc-prefix': 'test_ydl_stitching', - 'captures-table-extra-fields': {'test_id':test_id}}}) + site = brozzler.Site( + rr, + { + "seed": make_url(httpd, "/site10/"), + "warcprox_meta": { + "warc-prefix": "test_ydl_stitching", + "captures-table-extra-fields": {"test_id": test_id}, + }, + }, + ) brozzler.new_site(frontier, site) # the site should be brozzled fairly quickly start = time.time() - while site.status != 'FINISHED' and time.time() - start < 300: + while site.status != "FINISHED" and time.time() - start < 300: time.sleep(0.5) site.refresh() - assert site.status == 'FINISHED' + assert site.status == "FINISHED" # check page.videos pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 page = pages[0] assert len(page.videos) == 6 - stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/') + stitched_url = "youtube-dl:00001:%s" % make_url(httpd, "/site10/") assert { - 'blame': 'youtube-dl', - 'content-length': 267900, - 'content-type': 'video/mp4', - 'response_code': 204, - 'url': stitched_url, + "blame": "youtube-dl", + "content-length": 267900, + "content-type": "video/mp4", + "response_code": 204, + "url": stitched_url, } in page.videos - time.sleep(2) # in case warcprox hasn't finished processing urls + time.sleep(2) # in case warcprox hasn't finished processing urls # take a look at the captures table - captures = list(rr.table('captures').filter({'test_id':test_id}).run()) - l = [c for c in captures if c['url'] == stitched_url] + captures = list(rr.table("captures").filter({"test_id": test_id}).run()) + l = [c for c in captures if c["url"] == stitched_url] assert len(l) == 1 c = l[0] - assert c['filename'].startswith('test_ydl_stitching') - assert c['content_type'] == 'video/mp4' - assert c['http_method'] == 'WARCPROX_WRITE_RECORD' + assert c["filename"].startswith("test_ydl_stitching") + assert c["content_type"] == "video/mp4" + assert c["http_method"] == "WARCPROX_WRITE_RECORD" diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 64f7ab53..760962d1 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -''' +""" test_frontier.py - fairly narrow tests of frontier management, requires rethinkdb running on localhost @@ -16,7 +16,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-''' +""" import argparse import datetime @@ -32,102 +32,94 @@ args.log_level = logging.INFO brozzler.cli.configure_logging(args) + def test_rethinkdb_up(): - '''Checks that rethinkdb is listening and looks sane.''' - rr = doublethink.Rethinker(db='rethinkdb') # built-in db + """Checks that rethinkdb is listening and looks sane.""" + rr = doublethink.Rethinker(db="rethinkdb") # built-in db tbls = rr.table_list().run() assert len(tbls) > 10 + def test_basics(): - rr = doublethink.Rethinker(db='ignoreme') + rr = doublethink.Rethinker(db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) - job_conf = {'seeds': [ - {'url': 'http://example.com'}, {'url': 'https://example.org/'}]} + job_conf = { + "seeds": [{"url": "http://example.com"}, {"url": "https://example.org/"}] + } job = brozzler.new_job(frontier, job_conf) assert job.id assert job.starts_and_stops - assert job.starts_and_stops[0]['start'] + assert job.starts_and_stops[0]["start"] assert job == { - 'id': job.id, - 'conf': { - 'seeds': [ - {'url': 'http://example.com'}, - {'url': 'https://example.org/'} - ] + "id": job.id, + "conf": { + "seeds": [{"url": "http://example.com"}, {"url": "https://example.org/"}] }, - 'status': 'ACTIVE', - 'starts_and_stops': [ - { - 'start': job.starts_and_stops[0]['start'], - 'stop': None - } - ] + "status": "ACTIVE", + "starts_and_stops": [{"start": job.starts_and_stops[0]["start"], "stop": None}], } sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed) assert len(sites) == 2 - assert sites[0].starts_and_stops[0]['start'] - assert sites[1].starts_and_stops[0]['start'] + assert sites[0].starts_and_stops[0]["start"] + assert sites[1].starts_and_stops[0]["start"] assert sites[0] == { - 'claimed': False, - 'id': sites[0].id, - 'job_id': job.id, - 'last_claimed': brozzler.EPOCH_UTC, - 'last_disclaimed': brozzler.EPOCH_UTC, - 'scope': {'accepts': [{'ssurt': 'com,example,//http:/'}]}, - 'seed': 'http://example.com', - 'starts_and_stops': [ - { - 'start': sites[0].starts_and_stops[0]['start'], - 'stop': None - } + "claimed": False, + "id": sites[0].id, + "job_id": job.id, + "last_claimed": brozzler.EPOCH_UTC, + "last_disclaimed": brozzler.EPOCH_UTC, + "scope": {"accepts": [{"ssurt": "com,example,//http:/"}]}, + "seed": "http://example.com", + "starts_and_stops": [ + {"start": sites[0].starts_and_stops[0]["start"], "stop": None} ], - 'status': 'ACTIVE' + "status": "ACTIVE", } assert sites[1] == { - 'claimed': False, - 'id': sites[1].id, - 'job_id': job.id, - 'last_claimed': brozzler.EPOCH_UTC, - 'last_disclaimed': brozzler.EPOCH_UTC, - 'scope': {'accepts': [{'ssurt': 'org,example,//https:/'}]}, - 'seed': 'https://example.org/', - 'starts_and_stops': [ + "claimed": False, + "id": sites[1].id, + "job_id": job.id, + "last_claimed": brozzler.EPOCH_UTC, + "last_disclaimed": brozzler.EPOCH_UTC, + "scope": {"accepts": [{"ssurt": "org,example,//https:/"}]}, + "seed": "https://example.org/", + "starts_and_stops": [ { - 'start': sites[1].starts_and_stops[0]['start'], - 'stop': None, - }, + "start": sites[1].starts_and_stops[0]["start"], + "stop": None, + }, ], - 'status': 'ACTIVE', + "status": "ACTIVE", } pages = list(frontier.site_pages(sites[0].id)) assert len(pages) == 1 assert pages[0] == { - 'brozzle_count': 0, - 'claimed': False, - 'hops_from_seed': 0, - 'hops_off': 0, - 'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'), - 'job_id': job.id, - 'needs_robots_check': True, - 'priority': 1000, - 'site_id': sites[0].id, - 'url': 'http://example.com', + "brozzle_count": 0, + "claimed": 
False, + "hops_from_seed": 0, + "hops_off": 0, + "id": brozzler.Page.compute_id(sites[0].id, "http://example.com"), + "job_id": job.id, + "needs_robots_check": True, + "priority": 1000, + "site_id": sites[0].id, + "url": "http://example.com", } pages = list(frontier.site_pages(sites[1].id)) assert len(pages) == 1 assert pages[0] == { - 'brozzle_count': 0, - 'claimed': False, - 'hops_from_seed': 0, - 'hops_off': 0, - 'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'), - 'job_id': job.id, - 'needs_robots_check': True, - 'priority': 1000, - 'site_id': sites[1].id, - 'url': 'https://example.org/', + "brozzle_count": 0, + "claimed": False, + "hops_from_seed": 0, + "hops_off": 0, + "id": brozzler.Page.compute_id(sites[1].id, "https://example.org/"), + "job_id": job.id, + "needs_robots_check": True, + "priority": 1000, + "site_id": sites[1].id, + "url": "https://example.org/", } # test "brozzled" parameter of frontier.site_pages @@ -145,109 +137,112 @@ def test_basics(): assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1 assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0 + def test_resume_job(): - ''' + """ Tests that the right stuff gets twiddled in rethinkdb when we "start" and "finish" crawling a job. Doesn't actually crawl anything. - ''' + """ # vagrant brozzler-worker isn't configured to look at the "ignoreme" db - rr = doublethink.Rethinker(db='ignoreme') + rr = doublethink.Rethinker(db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) - job_conf = {'seeds': [{'url': 'http://example.com/'}]} + job_conf = {"seeds": [{"url": "http://example.com/"}]} job = brozzler.new_job(frontier, job_conf) assert len(list(frontier.job_sites(job.id))) == 1 site = list(frontier.job_sites(job.id))[0] - assert job.status == 'ACTIVE' + assert job.status == "ACTIVE" assert len(job.starts_and_stops) == 1 - assert job.starts_and_stops[0]['start'] - assert job.starts_and_stops[0]['stop'] is None - assert site.status == 'ACTIVE' + assert job.starts_and_stops[0]["start"] + assert job.starts_and_stops[0]["stop"] is None + assert site.status == "ACTIVE" assert len(site.starts_and_stops) == 1 - assert site.starts_and_stops[0]['start'] - assert site.starts_and_stops[0]['stop'] is None + assert site.starts_and_stops[0]["start"] + assert site.starts_and_stops[0]["stop"] is None - frontier.finished(site, 'FINISHED') + frontier.finished(site, "FINISHED") job.refresh() - assert job.status == 'FINISHED' + assert job.status == "FINISHED" assert len(job.starts_and_stops) == 1 - assert job.starts_and_stops[0]['start'] - assert job.starts_and_stops[0]['stop'] - assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start'] - assert site.status == 'FINISHED' + assert job.starts_and_stops[0]["start"] + assert job.starts_and_stops[0]["stop"] + assert job.starts_and_stops[0]["stop"] > job.starts_and_stops[0]["start"] + assert site.status == "FINISHED" assert len(site.starts_and_stops) == 1 - assert site.starts_and_stops[0]['start'] - assert site.starts_and_stops[0]['stop'] - assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start'] + assert site.starts_and_stops[0]["start"] + assert site.starts_and_stops[0]["stop"] + assert site.starts_and_stops[0]["stop"] > site.starts_and_stops[0]["start"] frontier.resume_site(site) job.refresh() - assert job.status == 'ACTIVE' + assert job.status == "ACTIVE" assert len(job.starts_and_stops) == 2 - assert job.starts_and_stops[1]['start'] - assert job.starts_and_stops[1]['stop'] is None - assert site.status == 
'ACTIVE' + assert job.starts_and_stops[1]["start"] + assert job.starts_and_stops[1]["stop"] is None + assert site.status == "ACTIVE" assert len(site.starts_and_stops) == 2 - assert site.starts_and_stops[1]['start'] - assert site.starts_and_stops[1]['stop'] is None + assert site.starts_and_stops[1]["start"] + assert site.starts_and_stops[1]["stop"] is None - frontier.finished(site, 'FINISHED') + frontier.finished(site, "FINISHED") job.refresh() - assert job.status == 'FINISHED' + assert job.status == "FINISHED" assert len(job.starts_and_stops) == 2 - assert job.starts_and_stops[1]['start'] - assert job.starts_and_stops[1]['stop'] - assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[1]['start'] - assert site.status == 'FINISHED' + assert job.starts_and_stops[1]["start"] + assert job.starts_and_stops[1]["stop"] + assert job.starts_and_stops[1]["stop"] > job.starts_and_stops[1]["start"] + assert site.status == "FINISHED" assert len(site.starts_and_stops) == 2 - assert site.starts_and_stops[1]['start'] - assert site.starts_and_stops[1]['stop'] - assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start'] + assert site.starts_and_stops[1]["start"] + assert site.starts_and_stops[1]["stop"] + assert site.starts_and_stops[1]["stop"] > site.starts_and_stops[1]["start"] # resuming a job == resuming all of its sites frontier.resume_job(job) site = list(frontier.job_sites(job.id))[0] - assert job.status == 'ACTIVE' + assert job.status == "ACTIVE" assert len(job.starts_and_stops) == 3 - assert job.starts_and_stops[2]['start'] - assert job.starts_and_stops[2]['stop'] is None - assert site.status == 'ACTIVE' + assert job.starts_and_stops[2]["start"] + assert job.starts_and_stops[2]["stop"] is None + assert site.status == "ACTIVE" assert len(site.starts_and_stops) == 3 - assert site.starts_and_stops[2]['start'] - assert site.starts_and_stops[2]['stop'] is None + assert site.starts_and_stops[2]["start"] + assert site.starts_and_stops[2]["stop"] is None - frontier.finished(site, 'FINISHED') + frontier.finished(site, "FINISHED") job.refresh() - assert job.status == 'FINISHED' + assert job.status == "FINISHED" assert len(job.starts_and_stops) == 3 - assert job.starts_and_stops[2]['start'] - assert job.starts_and_stops[2]['stop'] - assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[2]['start'] - assert site.status == 'FINISHED' + assert job.starts_and_stops[2]["start"] + assert job.starts_and_stops[2]["stop"] + assert job.starts_and_stops[2]["stop"] > job.starts_and_stops[2]["start"] + assert site.status == "FINISHED" assert len(site.starts_and_stops) == 3 - assert site.starts_and_stops[2]['start'] - assert site.starts_and_stops[2]['stop'] - assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[2]['start'] + assert site.starts_and_stops[2]["start"] + assert site.starts_and_stops[2]["stop"] + assert site.starts_and_stops[2]["stop"] > site.starts_and_stops[2]["start"] frontier.resume_job(job) site = list(frontier.job_sites(job.id))[0] - assert job.status == 'ACTIVE' + assert job.status == "ACTIVE" assert len(job.starts_and_stops) == 4 - assert job.starts_and_stops[3]['start'] - assert job.starts_and_stops[3]['stop'] is None - assert site.status == 'ACTIVE' + assert job.starts_and_stops[3]["start"] + assert job.starts_and_stops[3]["stop"] is None + assert site.status == "ACTIVE" assert len(site.starts_and_stops) == 4 - assert site.starts_and_stops[3]['start'] - assert site.starts_and_stops[3]['stop'] is None + assert site.starts_and_stops[3]["start"] + assert 
site.starts_and_stops[3]["stop"] is None # simulate a job stop request - job_conf = {'seeds': [{'url': 'http://example.com/'}, {'url': 'http://example_2.com/'}]} + job_conf = { + "seeds": [{"url": "http://example.com/"}, {"url": "http://example_2.com/"}] + } job = brozzler.new_job(frontier, job_conf) assert len(list(frontier.job_sites(job.id))) == 2 site1 = list(frontier.job_sites(job.id))[0] @@ -260,45 +255,45 @@ def test_resume_job(): with pytest.raises(brozzler.CrawlStopped): frontier.honor_stop_request(site1) - frontier.finished(site1, 'FINISHED_STOP_REQUESTED') - frontier.finished(site2, 'FINISHED_STOP_REQUESTED') + frontier.finished(site1, "FINISHED_STOP_REQUESTED") + frontier.finished(site2, "FINISHED_STOP_REQUESTED") job.refresh() - assert job.status == 'FINISHED' + assert job.status == "FINISHED" assert job.stop_requested assert len(job.starts_and_stops) == 1 - assert job.starts_and_stops[0]['start'] - assert job.starts_and_stops[0]['stop'] - assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start'] - assert site1.status == 'FINISHED_STOP_REQUESTED' - assert site2.status == 'FINISHED_STOP_REQUESTED' + assert job.starts_and_stops[0]["start"] + assert job.starts_and_stops[0]["stop"] + assert job.starts_and_stops[0]["stop"] > job.starts_and_stops[0]["start"] + assert site1.status == "FINISHED_STOP_REQUESTED" + assert site2.status == "FINISHED_STOP_REQUESTED" assert len(site1.starts_and_stops) == 1 assert len(site2.starts_and_stops) == 1 - assert site1.starts_and_stops[0]['start'] - assert site1.starts_and_stops[0]['stop'] - assert site1.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start'] - assert site2.starts_and_stops[0]['start'] - assert site2.starts_and_stops[0]['stop'] - assert site2.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start'] + assert site1.starts_and_stops[0]["start"] + assert site1.starts_and_stops[0]["stop"] + assert site1.starts_and_stops[0]["stop"] > site.starts_and_stops[0]["start"] + assert site2.starts_and_stops[0]["start"] + assert site2.starts_and_stops[0]["stop"] + assert site2.starts_and_stops[0]["stop"] > site.starts_and_stops[0]["start"] # simulate job resume after a stop request frontier.resume_job(job) site1 = list(frontier.job_sites(job.id))[0] site2 = list(frontier.job_sites(job.id))[1] - assert job.status == 'ACTIVE' + assert job.status == "ACTIVE" assert job.stop_requested is None assert len(job.starts_and_stops) == 2 - assert job.starts_and_stops[1]['start'] - assert job.starts_and_stops[1]['stop'] is None - assert site1.status == 'ACTIVE' + assert job.starts_and_stops[1]["start"] + assert job.starts_and_stops[1]["stop"] is None + assert site1.status == "ACTIVE" assert len(site1.starts_and_stops) == 2 - assert site1.starts_and_stops[1]['start'] - assert site1.starts_and_stops[1]['stop'] is None - assert site2.status == 'ACTIVE' + assert site1.starts_and_stops[1]["start"] + assert site1.starts_and_stops[1]["stop"] is None + assert site2.status == "ACTIVE" assert len(site2.starts_and_stops) == 2 - assert site2.starts_and_stops[1]['start'] - assert site2.starts_and_stops[1]['stop'] is None + assert site2.starts_and_stops[1]["start"] + assert site2.starts_and_stops[1]["stop"] is None # simulate a site stop request site1.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC) @@ -307,74 +302,75 @@ def test_resume_job(): # should not raise a CrawlStopped frontier.honor_stop_request(site2) - frontier.finished(site1, 'FINISHED_STOP_REQUESTED') + frontier.finished(site1, "FINISHED_STOP_REQUESTED") 
job.refresh() - assert job.status == 'ACTIVE' + assert job.status == "ACTIVE" assert job.stop_requested is None assert len(job.starts_and_stops) == 2 - assert job.starts_and_stops[1]['start'] - assert job.starts_and_stops[1]['stop'] is None - assert site1.status == 'FINISHED_STOP_REQUESTED' + assert job.starts_and_stops[1]["start"] + assert job.starts_and_stops[1]["stop"] is None + assert site1.status == "FINISHED_STOP_REQUESTED" assert len(site1.starts_and_stops) == 2 - assert site1.starts_and_stops[1]['start'] - assert site1.starts_and_stops[1]['stop'] - assert site1.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start'] - assert site2.status == 'ACTIVE' + assert site1.starts_and_stops[1]["start"] + assert site1.starts_and_stops[1]["stop"] + assert site1.starts_and_stops[1]["stop"] > site.starts_and_stops[1]["start"] + assert site2.status == "ACTIVE" assert len(site2.starts_and_stops) == 2 - assert site2.starts_and_stops[1]['start'] - assert site2.starts_and_stops[1]['stop'] is None + assert site2.starts_and_stops[1]["start"] + assert site2.starts_and_stops[1]["stop"] is None # simulate site resume after a stop request frontier.resume_site(site1) site1 = list(frontier.job_sites(job.id))[0] site2 = list(frontier.job_sites(job.id))[1] - assert job.status == 'ACTIVE' + assert job.status == "ACTIVE" assert job.stop_requested is None assert len(job.starts_and_stops) == 2 - assert job.starts_and_stops[1]['start'] - assert job.starts_and_stops[1]['stop'] is None - assert site1.status == 'ACTIVE' + assert job.starts_and_stops[1]["start"] + assert job.starts_and_stops[1]["stop"] is None + assert site1.status == "ACTIVE" assert site1.stop_requested is None assert len(site1.starts_and_stops) == 3 - assert site1.starts_and_stops[2]['start'] - assert site1.starts_and_stops[2]['stop'] is None - assert site2.status == 'ACTIVE' + assert site1.starts_and_stops[2]["start"] + assert site1.starts_and_stops[2]["stop"] is None + assert site2.status == "ACTIVE" assert len(site2.starts_and_stops) == 2 - assert site2.starts_and_stops[1]['start'] - assert site2.starts_and_stops[1]['stop'] is None + assert site2.starts_and_stops[1]["start"] + assert site2.starts_and_stops[1]["stop"] is None + def test_time_limit(): # XXX test not thoroughly adapted to change in time accounting, since # starts_and_stops is no longer used to enforce time limits # vagrant brozzler-worker isn't configured to look at the "ignoreme" db - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) - site = brozzler.Site(rr, {'seed':'http://example.com/', 'time_limit':99999}) + site = brozzler.Site(rr, {"seed": "http://example.com/", "time_limit": 99999}) brozzler.new_site(frontier, site) site.refresh() # get it back from the db - assert site.status == 'ACTIVE' + assert site.status == "ACTIVE" assert len(site.starts_and_stops) == 1 - assert site.starts_and_stops[0]['start'] - assert site.starts_and_stops[0]['stop'] is None + assert site.starts_and_stops[0]["start"] + assert site.starts_and_stops[0]["stop"] is None - frontier.finished(site, 'FINISHED') + frontier.finished(site, "FINISHED") - assert site.status == 'FINISHED' + assert site.status == "FINISHED" assert len(site.starts_and_stops) == 1 - assert site.starts_and_stops[0]['start'] - assert site.starts_and_stops[0]['stop'] - assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start'] + assert site.starts_and_stops[0]["start"] + assert 
site.starts_and_stops[0]["stop"] + assert site.starts_and_stops[0]["stop"] > site.starts_and_stops[0]["start"] frontier.resume_site(site) - assert site.status == 'ACTIVE' + assert site.status == "ACTIVE" assert len(site.starts_and_stops) == 2 - assert site.starts_and_stops[1]['start'] - assert site.starts_and_stops[1]['stop'] is None + assert site.starts_and_stops[1]["start"] + assert site.starts_and_stops[1]["stop"] is None # no time limit set frontier.enforce_time_limit(site) @@ -385,10 +381,10 @@ def test_time_limit(): # time limit not reached yet frontier.enforce_time_limit(site) - assert site.status == 'ACTIVE' + assert site.status == "ACTIVE" assert len(site.starts_and_stops) == 2 - assert site.starts_and_stops[1]['start'] - assert site.starts_and_stops[1]['stop'] is None + assert site.starts_and_stops[1]["start"] + assert site.starts_and_stops[1]["stop"] is None site.time_limit = 0.1 time.sleep(0.1) @@ -396,12 +392,13 @@ def test_time_limit(): with pytest.raises(brozzler.ReachedTimeLimit): frontier.enforce_time_limit(site) + def test_field_defaults(): - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") # page brozzler.Page.table_ensure(rr) - page = brozzler.Page(rr, {'hops_from_seed': 3}) + page = brozzler.Page(rr, {"hops_from_seed": 3}) assert page.hops_from_seed == 3 assert page.id assert page.brozzle_count == 0 @@ -425,9 +422,9 @@ def test_field_defaults(): # site brozzler.Site.table_ensure(rr) - site = brozzler.Site(rr, {'seed': 'http://example.com/'}) + site = brozzler.Site(rr, {"seed": "http://example.com/"}) assert site.id is None - assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/'}]} + assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/"}]} site.save() assert site.id assert site.scope @@ -444,41 +441,44 @@ def test_field_defaults(): # job brozzler.Job.table_ensure(rr) - job = brozzler.Job(rr, {'status': 'WHUUUT'}) - assert job.status == 'WHUUUT' + job = brozzler.Job(rr, {"status": "WHUUUT"}) + assert job.status == "WHUUUT" assert job.id is None assert job.starts_and_stops job.save() - assert job.status == 'WHUUUT' + assert job.status == "WHUUUT" assert job.id assert job.starts_and_stops kob = brozzler.Job.load(rr, job.id) - assert kob.status == 'WHUUUT' + assert kob.status == "WHUUUT" assert kob.id assert kob.starts_and_stops kob.save() - assert kob.status == 'WHUUUT' + assert kob.status == "WHUUUT" assert kob.id assert kob.starts_and_stops kob.refresh() - assert kob.status == 'WHUUUT' + assert kob.status == "WHUUUT" assert kob.id assert kob.starts_and_stops + def test_scope_and_schedule_outlinks(): - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) - site = brozzler.Site(rr, {'seed':'http://example.com/'}) - parent_page = brozzler.Page(rr, { - 'hops_from_seed': 1, 'url': 'http://example.com/whatever'}) + site = brozzler.Site(rr, {"seed": "http://example.com/"}) + parent_page = brozzler.Page( + rr, {"hops_from_seed": 1, "url": "http://example.com/whatever"} + ) outlinks = [ - 'https://example.com/', - 'https://example.com/foo', - 'http://example.com/bar', - 'HTtp://exAMPle.COm/bar', - 'HTtp://exAMPle.COm/BAr', - 'HTtp://exAMPle.COm/BAZZZZ',] + "https://example.com/", + "https://example.com/foo", + "http://example.com/bar", + "HTtp://exAMPle.COm/bar", + "HTtp://exAMPle.COm/BAr", + "HTtp://exAMPle.COm/BAZZZZ", + ] orig_is_permitted_by_robots = brozzler.is_permitted_by_robots 
brozzler.is_permitted_by_robots = lambda *args: True try: @@ -486,150 +486,176 @@ def test_scope_and_schedule_outlinks(): finally: brozzler.is_permitted_by_robots = orig_is_permitted_by_robots - assert sorted(parent_page.outlinks['rejected']) == [ - 'https://example.com/', 'https://example.com/foo'] - assert sorted(parent_page.outlinks['accepted']) == [ - 'http://example.com/BAZZZZ', 'http://example.com/BAr', - 'http://example.com/bar'] - assert parent_page.outlinks['blocked'] == [] + assert sorted(parent_page.outlinks["rejected"]) == [ + "https://example.com/", + "https://example.com/foo", + ] + assert sorted(parent_page.outlinks["accepted"]) == [ + "http://example.com/BAZZZZ", + "http://example.com/BAr", + "http://example.com/bar", + ] + assert parent_page.outlinks["blocked"] == [] pp = brozzler.Page.load(rr, parent_page.id) assert pp == parent_page - for url in parent_page.outlinks['rejected']: + for url in parent_page.outlinks["rejected"]: id = brozzler.Page.compute_id(site.id, url) assert brozzler.Page.load(rr, id) is None - for url in parent_page.outlinks['accepted']: + for url in parent_page.outlinks["accepted"]: id = brozzler.Page.compute_id(site.id, url) assert brozzler.Page.load(rr, id) + def test_parent_url_scoping(): - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) # scope rules that look at parent page url should consider both the # original url and the redirect url, if any, of the parent page - site = brozzler.Site(rr, { - 'seed': 'http://example.com/foo/', - 'scope': { - 'accepts': [{ - 'parent_url_regex': '^http://example.com/acceptme/.*$'}], - 'blocks': [{ - 'parent_url_regex': '^http://example.com/blockme/.*$'}], + site = brozzler.Site( + rr, + { + "seed": "http://example.com/foo/", + "scope": { + "accepts": [{"parent_url_regex": "^http://example.com/acceptme/.*$"}], + "blocks": [{"parent_url_regex": "^http://example.com/blockme/.*$"}], }, - 'remember_outlinks': True}) + "remember_outlinks": True, + }, + ) site.save() # an outlink that would not otherwise be in scope - outlinks = ['https://some-random-url.com/'] + outlinks = ["https://some-random-url.com/"] # parent page does not match any parent_url_regex - parent_page = brozzler.Page(rr, { - 'site_id': site.id, - 'url': 'http://example.com/foo/spluh'}) + parent_page = brozzler.Page( + rr, {"site_id": site.id, "url": "http://example.com/foo/spluh"} + ) orig_is_permitted_by_robots = brozzler.is_permitted_by_robots brozzler.is_permitted_by_robots = lambda *args: True try: frontier.scope_and_schedule_outlinks(site, parent_page, outlinks) finally: brozzler.is_permitted_by_robots = orig_is_permitted_by_robots - assert parent_page.outlinks['rejected'] == outlinks - assert parent_page.outlinks['accepted'] == [] + assert parent_page.outlinks["rejected"] == outlinks + assert parent_page.outlinks["accepted"] == [] # parent page url matches accept parent_url_regex - parent_page = brozzler.Page(rr, { - 'site_id': site.id, - 'url': 'http://example.com/acceptme/futz'}) + parent_page = brozzler.Page( + rr, {"site_id": site.id, "url": "http://example.com/acceptme/futz"} + ) orig_is_permitted_by_robots = brozzler.is_permitted_by_robots brozzler.is_permitted_by_robots = lambda *args: True try: frontier.scope_and_schedule_outlinks(site, parent_page, outlinks) finally: brozzler.is_permitted_by_robots = orig_is_permitted_by_robots - assert parent_page.outlinks['rejected'] == [] - assert parent_page.outlinks['accepted'] == outlinks + 
assert parent_page.outlinks["rejected"] == [] + assert parent_page.outlinks["accepted"] == outlinks # parent page redirect_url matches accept parent_url_regex - parent_page_c = brozzler.Page(rr, { - 'site_id': site.id, - 'url': 'http://example.com/toot/blah', - 'redirect_url':'http://example.com/acceptme/futz'}) + parent_page_c = brozzler.Page( + rr, + { + "site_id": site.id, + "url": "http://example.com/toot/blah", + "redirect_url": "http://example.com/acceptme/futz", + }, + ) orig_is_permitted_by_robots = brozzler.is_permitted_by_robots brozzler.is_permitted_by_robots = lambda *args: True try: frontier.scope_and_schedule_outlinks(site, parent_page, outlinks) finally: brozzler.is_permitted_by_robots = orig_is_permitted_by_robots - assert parent_page.outlinks['rejected'] == [] - assert parent_page.outlinks['accepted'] == outlinks + assert parent_page.outlinks["rejected"] == [] + assert parent_page.outlinks["accepted"] == outlinks # an outlink that would normally be in scope - outlinks = ['http://example.com/foo/whatever/'] + outlinks = ["http://example.com/foo/whatever/"] # parent page does not match any parent_url_regex - parent_page = brozzler.Page(rr, { - 'site_id': site.id, - 'url': 'http://example.com/foo/spluh'}) + parent_page = brozzler.Page( + rr, {"site_id": site.id, "url": "http://example.com/foo/spluh"} + ) orig_is_permitted_by_robots = brozzler.is_permitted_by_robots brozzler.is_permitted_by_robots = lambda *args: True try: frontier.scope_and_schedule_outlinks(site, parent_page, outlinks) finally: brozzler.is_permitted_by_robots = orig_is_permitted_by_robots - assert parent_page.outlinks['rejected'] == [] - assert parent_page.outlinks['accepted'] == outlinks + assert parent_page.outlinks["rejected"] == [] + assert parent_page.outlinks["accepted"] == outlinks # parent page url matches block parent_url_regex - parent_page = brozzler.Page(rr, { - 'site_id': site.id, - 'url': 'http://example.com/blockme/futz'}) + parent_page = brozzler.Page( + rr, {"site_id": site.id, "url": "http://example.com/blockme/futz"} + ) orig_is_permitted_by_robots = brozzler.is_permitted_by_robots brozzler.is_permitted_by_robots = lambda *args: True try: frontier.scope_and_schedule_outlinks(site, parent_page, outlinks) finally: brozzler.is_permitted_by_robots = orig_is_permitted_by_robots - assert parent_page.outlinks['rejected'] == outlinks - assert parent_page.outlinks['accepted'] == [] + assert parent_page.outlinks["rejected"] == outlinks + assert parent_page.outlinks["accepted"] == [] # parent page redirect_url matches block parent_url_regex - parent_page_c = brozzler.Page(rr, { - 'site_id': site.id, - 'url': 'http://example.com/toot/blah', - 'redirect_url':'http://example.com/blockme/futz'}) + parent_page_c = brozzler.Page( + rr, + { + "site_id": site.id, + "url": "http://example.com/toot/blah", + "redirect_url": "http://example.com/blockme/futz", + }, + ) orig_is_permitted_by_robots = brozzler.is_permitted_by_robots brozzler.is_permitted_by_robots = lambda *args: True try: frontier.scope_and_schedule_outlinks(site, parent_page, outlinks) finally: brozzler.is_permitted_by_robots = orig_is_permitted_by_robots - assert parent_page.outlinks['rejected'] == outlinks - assert parent_page.outlinks['accepted'] == [] + assert parent_page.outlinks["rejected"] == outlinks + assert parent_page.outlinks["accepted"] == [] + def test_completed_page(): - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) # redirect 
that changes scope surt - site = brozzler.Site(rr, {'seed':'http://example.com/a/'}) + site = brozzler.Site(rr, {"seed": "http://example.com/a/"}) site.save() - page = brozzler.Page(rr, { - 'site_id': site.id, - 'url': 'http://example.com/a/', - 'claimed': True, - 'brozzle_count': 0, - 'hops_from_seed': 0, - 'redirect_url':'http://example.com/b/', }) + page = brozzler.Page( + rr, + { + "site_id": site.id, + "url": "http://example.com/a/", + "claimed": True, + "brozzle_count": 0, + "hops_from_seed": 0, + "redirect_url": "http://example.com/b/", + }, + ) page.save() - assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} + assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]} frontier.completed_page(site, page) - assert site.scope == {'accepts': [ - {'ssurt': 'com,example,//http:/a/'}, - {'ssurt': 'com,example,//http:/b/'}]} + assert site.scope == { + "accepts": [ + {"ssurt": "com,example,//http:/a/"}, + {"ssurt": "com,example,//http:/b/"}, + ] + } site.refresh() - assert site.scope == {'accepts': [ - {'ssurt': 'com,example,//http:/a/'}, - {'ssurt': 'com,example,//http:/b/'}]} + assert site.scope == { + "accepts": [ + {"ssurt": "com,example,//http:/a/"}, + {"ssurt": "com,example,//http:/b/"}, + ] + } assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -638,21 +664,25 @@ def test_completed_page(): # redirect that doesn't change scope surt because destination is covered by # the original surt - site = brozzler.Site(rr, {'seed':'http://example.com/a/'}) + site = brozzler.Site(rr, {"seed": "http://example.com/a/"}) site.save() - page = brozzler.Page(rr, { - 'site_id': site.id, - 'url': 'http://example.com/a/', - 'claimed': True, - 'brozzle_count': 0, - 'hops_from_seed': 0, - 'redirect_url':'http://example.com/a/x/', }) + page = brozzler.Page( + rr, + { + "site_id": site.id, + "url": "http://example.com/a/", + "claimed": True, + "brozzle_count": 0, + "hops_from_seed": 0, + "redirect_url": "http://example.com/a/x/", + }, + ) page.save() - assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} + assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]} frontier.completed_page(site, page) - assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} + assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]} site.refresh() - assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} + assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -660,121 +690,133 @@ def test_completed_page(): assert page.claimed == False # redirect that doesn't change scope surt because page is not the seed page - site = brozzler.Site(rr, {'seed':'http://example.com/a/'}) + site = brozzler.Site(rr, {"seed": "http://example.com/a/"}) site.save() - page = brozzler.Page(rr, { - 'site_id': site.id, - 'url': 'http://example.com/c/', - 'claimed': True, - 'brozzle_count': 0, - 'hops_from_seed': 1, - 'redirect_url':'http://example.com/d/', }) + page = brozzler.Page( + rr, + { + "site_id": site.id, + "url": "http://example.com/c/", + "claimed": True, + "brozzle_count": 0, + "hops_from_seed": 1, + "redirect_url": "http://example.com/d/", + }, + ) page.save() - assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} + assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]} frontier.completed_page(site, page) - assert site.scope == {'accepts': [{'ssurt': 
'com,example,//http:/a/'}]} + assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]} site.refresh() - assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} + assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() assert page.brozzle_count == 1 assert page.claimed == False + def test_seed_page(): - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) - site = brozzler.Site(rr, {'seed':'http://example.com/a/'}) + site = brozzler.Site(rr, {"seed": "http://example.com/a/"}) site.save() assert frontier.seed_page(site.id) is None - page1 = brozzler.Page(rr, { - 'site_id': site.id, - 'url': 'http://example.com/a/b/', - 'hops_from_seed': 1}) + page1 = brozzler.Page( + rr, {"site_id": site.id, "url": "http://example.com/a/b/", "hops_from_seed": 1} + ) page1.save() assert frontier.seed_page(site.id) is None - page0 = brozzler.Page(rr, { - 'site_id': site.id, - 'url': 'http://example.com/a/', - 'hops_from_seed': 0}) + page0 = brozzler.Page( + rr, {"site_id": site.id, "url": "http://example.com/a/", "hops_from_seed": 0} + ) page0.save() assert frontier.seed_page(site.id) == page0 + def test_hashtag_seed(): - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) # no hash tag - site = brozzler.Site(rr, {'seed': 'http://example.org/'}) + site = brozzler.Site(rr, {"seed": "http://example.org/"}) brozzler.new_site(frontier, site) - assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]} + assert site.scope == {"accepts": [{"ssurt": "org,example,//http:/"}]} pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 - assert pages[0].url == 'http://example.org/' + assert pages[0].url == "http://example.org/" assert not pages[0].hashtags # yes hash tag - site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'}) + site = brozzler.Site(rr, {"seed": "http://example.org/#hash"}) brozzler.new_site(frontier, site) - assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]} + assert site.scope == {"accepts": [{"ssurt": "org,example,//http:/"}]} pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 - assert pages[0].url == 'http://example.org/' - assert pages[0].hashtags == ['#hash',] + assert pages[0].url == "http://example.org/" + assert pages[0].hashtags == [ + "#hash", + ] + def test_hashtag_links(): - rr = doublethink.Rethinker('localhost', db='test_hashtag_links') + rr = doublethink.Rethinker("localhost", db="test_hashtag_links") frontier = brozzler.RethinkDbFrontier(rr) - site = brozzler.Site(rr, {'seed': 'http://example.org/'}) + site = brozzler.Site(rr, {"seed": "http://example.org/"}) brozzler.new_site(frontier, site) parent_page = frontier.seed_page(site.id) assert not parent_page.hashtags outlinks = [ - 'http://example.org/#foo', - 'http://example.org/bar', - 'http://example.org/bar#baz', - 'http://example.org/bar#quux', - 'http://example.org/zuh#buh', + "http://example.org/#foo", + "http://example.org/bar", + "http://example.org/bar#baz", + "http://example.org/bar#quux", + "http://example.org/zuh#buh", ] frontier.scope_and_schedule_outlinks(site, parent_page, outlinks) pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) assert len(pages) == 3 - assert pages[0].url == 'http://example.org/' - assert 
sorted(pages[0].outlinks['accepted']) == [ - 'http://example.org/', 'http://example.org/bar', - 'http://example.org/zuh'] - assert not pages[0].outlinks['blocked'] - assert not pages[0].outlinks['rejected'] - assert pages[0].hashtags == ['#foo',] + assert pages[0].url == "http://example.org/" + assert sorted(pages[0].outlinks["accepted"]) == [ + "http://example.org/", + "http://example.org/bar", + "http://example.org/zuh", + ] + assert not pages[0].outlinks["blocked"] + assert not pages[0].outlinks["rejected"] + assert pages[0].hashtags == [ + "#foo", + ] assert pages[0].hops_from_seed == 0 - assert pages[1].url == 'http://example.org/bar' - assert sorted(pages[1].hashtags) == ['#baz','#quux'] + assert pages[1].url == "http://example.org/bar" + assert sorted(pages[1].hashtags) == ["#baz", "#quux"] assert pages[1].priority == 36 assert pages[1].hops_from_seed == 1 - assert pages[2].url == 'http://example.org/zuh' - assert pages[2].hashtags == ['#buh'] + assert pages[2].url == "http://example.org/zuh" + assert pages[2].hashtags == ["#buh"] assert pages[2].priority == 12 + def test_honor_stop_request(): - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) # 1. test stop request on job - job_conf = {'seeds': [{'url': 'http://example.com'}]} + job_conf = {"seeds": [{"url": "http://example.com"}]} job = brozzler.new_job(frontier, job_conf) assert job.id sites = list(frontier.job_sites(job.id)) @@ -786,14 +828,13 @@ def test_honor_stop_request(): frontier.honor_stop_request(site) # set job.stop_requested - job.stop_requested = datetime.datetime.utcnow().replace( - tzinfo=doublethink.UTC) + job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC) job.save() with pytest.raises(brozzler.CrawlStopped): frontier.honor_stop_request(site) # 2. 
test stop request on site - job_conf = {'seeds': [{'url': 'http://example.com'}]} + job_conf = {"seeds": [{"url": "http://example.com"}]} job = brozzler.new_job(frontier, job_conf) assert job.id sites = list(frontier.job_sites(job.id)) @@ -810,16 +851,17 @@ def test_honor_stop_request(): with pytest.raises(brozzler.CrawlStopped): frontier.honor_stop_request(site) + def test_claim_site(): - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) - rr.table('sites').delete().run() # clean slate + rr.table("sites").delete().run() # clean slate with pytest.raises(brozzler.NothingToClaim): claimed_site = frontier.claim_sites() - site = brozzler.Site(rr, {'seed': 'http://example.org/'}) + site = brozzler.Site(rr, {"seed": "http://example.org/"}) brozzler.new_site(frontier, site) claimed_sites = frontier.claim_sites() @@ -827,7 +869,9 @@ def test_claim_site(): claimed_site = claimed_sites[0] assert claimed_site.id == site.id assert claimed_site.claimed - assert claimed_site.last_claimed >= doublethink.utcnow() - datetime.timedelta(minutes=1) + assert claimed_site.last_claimed >= doublethink.utcnow() - datetime.timedelta( + minutes=1 + ) with pytest.raises(brozzler.NothingToClaim): claimed_site = frontier.claim_sites() @@ -848,27 +892,28 @@ def test_claim_site(): assert claimed_site.id == site.id # clean up - rr.table('sites').get(claimed_site.id).delete().run() + rr.table("sites").get(claimed_site.id).delete().run() + def test_max_claimed_sites(): # max_claimed_sites is a brozzler job setting that puts a cap on the number # of the job's sites that can be brozzled simultaneously across the cluster - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) # clean slate - rr.table('jobs').delete().run() - rr.table('sites').delete().run() + rr.table("jobs").delete().run() + rr.table("sites").delete().run() job_conf = { - 'seeds': [ - {'url': 'http://example.com/1'}, - {'url': 'http://example.com/2'}, - {'url': 'http://example.com/3'}, - {'url': 'http://example.com/4'}, - {'url': 'http://example.com/5'}, + "seeds": [ + {"url": "http://example.com/1"}, + {"url": "http://example.com/2"}, + {"url": "http://example.com/3"}, + {"url": "http://example.com/4"}, + {"url": "http://example.com/5"}, ], - 'max_claimed_sites': 3, + "max_claimed_sites": 3, } job = brozzler.new_job(frontier, job_conf) @@ -887,214 +932,274 @@ def test_max_claimed_sites(): claimed_site = frontier.claim_sites(3) # clean slate for the next one - rr.table('jobs').delete().run() - rr.table('sites').delete().run() + rr.table("jobs").delete().run() + rr.table("sites").delete().run() + def test_choose_warcprox(): - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") svcreg = doublethink.ServiceRegistry(rr) frontier = brozzler.RethinkDbFrontier(rr) # avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021 - rr.table('sites').wait().run() - rr.table('services').wait().run() - rr.table('sites').index_wait().run() - rr.table('services').index_wait().run() + rr.table("sites").wait().run() + rr.table("services").wait().run() + rr.table("sites").index_wait().run() + rr.table("services").index_wait().run() # clean slate - rr.table('sites').delete().run() - rr.table('services').delete().run() + rr.table("sites").delete().run() + rr.table("services").delete().run() 
worker = brozzler.BrozzlerWorker(frontier, svcreg) assert worker._choose_warcprox() is None - rr.table('services').insert({ - 'role': 'warcprox', - 'first_heartbeat': doublethink.utcnow(), - 'last_heartbeat': doublethink.utcnow(), - 'host': 'host1', 'port': 8000, - 'load': 0, 'ttl': 60}).run() - rr.table('services').insert({ - 'role': 'warcprox', - 'first_heartbeat': doublethink.utcnow(), - 'last_heartbeat': doublethink.utcnow(), - 'host': 'host2', 'port': 8000, - 'load': 0, 'ttl': 60}).run() - rr.table('services').insert({ - 'role': 'warcprox', - 'first_heartbeat': doublethink.utcnow(), - 'last_heartbeat': doublethink.utcnow(), - 'host': 'host2', 'port': 8001, - 'load': 0, 'ttl': 60}).run() - rr.table('services').insert({ - 'role': 'warcprox', - 'first_heartbeat': doublethink.utcnow(), - 'last_heartbeat': doublethink.utcnow(), - 'host': 'host3', 'port': 8000, - 'load': 0, 'ttl': 60}).run() - rr.table('services').insert({ - 'role': 'warcprox', - 'first_heartbeat': doublethink.utcnow(), - 'last_heartbeat': doublethink.utcnow(), - 'host': 'host4', 'port': 8000, - 'load': 1, 'ttl': 60}).run() - - rr.table('sites').insert({ - 'proxy': 'host1:8000', 'status': 'ACTIVE', - 'last_disclaimed': doublethink.utcnow()}).run() - rr.table('sites').insert({ - 'proxy': 'host1:8000', 'status': 'ACTIVE', - 'last_disclaimed': doublethink.utcnow()}).run() - rr.table('sites').insert({ - 'proxy': 'host2:8000', 'status': 'ACTIVE', - 'last_disclaimed': doublethink.utcnow()}).run() - rr.table('sites').insert({ - 'proxy': 'host2:8001', 'status': 'ACTIVE', - 'last_disclaimed': doublethink.utcnow()}).run() + rr.table("services").insert( + { + "role": "warcprox", + "first_heartbeat": doublethink.utcnow(), + "last_heartbeat": doublethink.utcnow(), + "host": "host1", + "port": 8000, + "load": 0, + "ttl": 60, + } + ).run() + rr.table("services").insert( + { + "role": "warcprox", + "first_heartbeat": doublethink.utcnow(), + "last_heartbeat": doublethink.utcnow(), + "host": "host2", + "port": 8000, + "load": 0, + "ttl": 60, + } + ).run() + rr.table("services").insert( + { + "role": "warcprox", + "first_heartbeat": doublethink.utcnow(), + "last_heartbeat": doublethink.utcnow(), + "host": "host2", + "port": 8001, + "load": 0, + "ttl": 60, + } + ).run() + rr.table("services").insert( + { + "role": "warcprox", + "first_heartbeat": doublethink.utcnow(), + "last_heartbeat": doublethink.utcnow(), + "host": "host3", + "port": 8000, + "load": 0, + "ttl": 60, + } + ).run() + rr.table("services").insert( + { + "role": "warcprox", + "first_heartbeat": doublethink.utcnow(), + "last_heartbeat": doublethink.utcnow(), + "host": "host4", + "port": 8000, + "load": 1, + "ttl": 60, + } + ).run() + + rr.table("sites").insert( + { + "proxy": "host1:8000", + "status": "ACTIVE", + "last_disclaimed": doublethink.utcnow(), + } + ).run() + rr.table("sites").insert( + { + "proxy": "host1:8000", + "status": "ACTIVE", + "last_disclaimed": doublethink.utcnow(), + } + ).run() + rr.table("sites").insert( + { + "proxy": "host2:8000", + "status": "ACTIVE", + "last_disclaimed": doublethink.utcnow(), + } + ).run() + rr.table("sites").insert( + { + "proxy": "host2:8001", + "status": "ACTIVE", + "last_disclaimed": doublethink.utcnow(), + } + ).run() instance = worker._choose_warcprox() - assert instance['host'] == 'host3' - assert instance['port'] == 8000 - rr.table('sites').insert({ - 'proxy': 'host3:8000', 'status': 'ACTIVE', - 'last_disclaimed': doublethink.utcnow()}).run() + assert instance["host"] == "host3" + assert instance["port"] == 8000 + 
rr.table("sites").insert( + { + "proxy": "host3:8000", + "status": "ACTIVE", + "last_disclaimed": doublethink.utcnow(), + } + ).run() instance = worker._choose_warcprox() - assert instance['host'] == 'host4' - assert instance['port'] == 8000 + assert instance["host"] == "host4" + assert instance["port"] == 8000 # clean up - rr.table('sites').delete().run() - rr.table('services').delete().run() + rr.table("sites").delete().run() + rr.table("services").delete().run() + def test_max_hops_off(): - rr = doublethink.Rethinker('localhost', db='ignoreme') + rr = doublethink.Rethinker("localhost", db="ignoreme") frontier = brozzler.RethinkDbFrontier(rr) - site = brozzler.Site(rr, { - 'seed': 'http://example.com/', - 'scope': { - 'max_hops_off_surt': 1, - 'blocks': [{'ssurt': 'domain,bad,'}]}}) + site = brozzler.Site( + rr, + { + "seed": "http://example.com/", + "scope": {"max_hops_off_surt": 1, "blocks": [{"ssurt": "domain,bad,"}]}, + }, + ) brozzler.new_site(frontier, site) site.refresh() # get it back from the db # renamed this param - assert not 'max_hops_off_surt' in site.scope - assert site.scope['max_hops_off'] == 1 + assert not "max_hops_off_surt" in site.scope + assert site.scope["max_hops_off"] == 1 seed_page = frontier.seed_page(site.id) - assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None - assert site.accept_reject_or_neither('https://example.com/toot', seed_page) is None - assert site.accept_reject_or_neither('http://example.com/toot', seed_page) is True - assert site.accept_reject_or_neither('https://some.bad.domain/something', seed_page) is False + assert site.accept_reject_or_neither("http://foo.org/", seed_page) is None + assert site.accept_reject_or_neither("https://example.com/toot", seed_page) is None + assert site.accept_reject_or_neither("http://example.com/toot", seed_page) is True + assert ( + site.accept_reject_or_neither("https://some.bad.domain/something", seed_page) + is False + ) orig_is_permitted_by_robots = brozzler.is_permitted_by_robots brozzler.is_permitted_by_robots = lambda *args: True try: # two of these are in scope because of max_hops_off - frontier.scope_and_schedule_outlinks(site, seed_page, [ - 'http://foo.org/', 'https://example.com/toot', - 'http://example.com/toot', 'https://some.bad.domain/something']) + frontier.scope_and_schedule_outlinks( + site, + seed_page, + [ + "http://foo.org/", + "https://example.com/toot", + "http://example.com/toot", + "https://some.bad.domain/something", + ], + ) finally: brozzler.is_permitted_by_robots = orig_is_permitted_by_robots pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) assert len(pages) == 4 - assert pages[0].url == 'http://example.com/' + assert pages[0].url == "http://example.com/" assert pages[0].hops_off == 0 - assert not 'hops_off_surt' in pages[0] - assert set(pages[0].outlinks['accepted']) == { - 'https://example.com/toot', 'http://foo.org/', - 'http://example.com/toot'} - assert pages[0].outlinks['blocked'] == [] - assert pages[0].outlinks['rejected'] == [ - 'https://some.bad.domain/something'] + assert not "hops_off_surt" in pages[0] + assert set(pages[0].outlinks["accepted"]) == { + "https://example.com/toot", + "http://foo.org/", + "http://example.com/toot", + } + assert pages[0].outlinks["blocked"] == [] + assert pages[0].outlinks["rejected"] == ["https://some.bad.domain/something"] assert { - 'brozzle_count': 0, - 'claimed': False, - 'hashtags': [], - 'hops_from_seed': 1, - 'hops_off': 0, - 'id': brozzler.Page.compute_id(site.id, 
'http://example.com/toot'), - 'job_id': None, - 'needs_robots_check': False, - 'priority': 12, - 'site_id': site.id, - 'url': 'http://example.com/toot', - 'via_page_id': seed_page.id + "brozzle_count": 0, + "claimed": False, + "hashtags": [], + "hops_from_seed": 1, + "hops_off": 0, + "id": brozzler.Page.compute_id(site.id, "http://example.com/toot"), + "job_id": None, + "needs_robots_check": False, + "priority": 12, + "site_id": site.id, + "url": "http://example.com/toot", + "via_page_id": seed_page.id, } in pages assert { - 'brozzle_count': 0, - 'claimed': False, - 'hashtags': [], - 'hops_from_seed': 1, - 'hops_off': 1, - 'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'), - 'job_id': None, - 'needs_robots_check': False, - 'priority': 12, - 'site_id': site.id, - 'url': 'http://foo.org/', - 'via_page_id': seed_page.id + "brozzle_count": 0, + "claimed": False, + "hashtags": [], + "hops_from_seed": 1, + "hops_off": 1, + "id": brozzler.Page.compute_id(site.id, "http://foo.org/"), + "job_id": None, + "needs_robots_check": False, + "priority": 12, + "site_id": site.id, + "url": "http://foo.org/", + "via_page_id": seed_page.id, } in pages assert { - 'brozzle_count': 0, - 'claimed': False, - 'hashtags': [], - 'hops_from_seed': 1, - 'hops_off': 1, - 'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'), - 'job_id': None, - 'needs_robots_check': False, - 'priority': 12, - 'site_id': site.id, - 'url': 'https://example.com/toot', - 'via_page_id': seed_page.id + "brozzle_count": 0, + "claimed": False, + "hashtags": [], + "hops_from_seed": 1, + "hops_off": 1, + "id": brozzler.Page.compute_id(site.id, "https://example.com/toot"), + "job_id": None, + "needs_robots_check": False, + "priority": 12, + "site_id": site.id, + "url": "https://example.com/toot", + "via_page_id": seed_page.id, } in pages # next hop is past max_hops_off, but normal in scope url is in scope - foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0] + foo_page = [pg for pg in pages if pg.url == "http://foo.org/"][0] orig_is_permitted_by_robots = brozzler.is_permitted_by_robots brozzler.is_permitted_by_robots = lambda *args: True try: - frontier.scope_and_schedule_outlinks(site, foo_page, [ - 'http://foo.org/bar', 'http://example.com/blah']) + frontier.scope_and_schedule_outlinks( + site, foo_page, ["http://foo.org/bar", "http://example.com/blah"] + ) finally: brozzler.is_permitted_by_robots = orig_is_permitted_by_robots assert foo_page == { - 'brozzle_count': 0, - 'claimed': False, - 'hashtags': [], - 'hops_from_seed': 1, - 'hops_off': 1, - 'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'), - 'job_id': None, - 'needs_robots_check': False, - 'priority': 12, - 'site_id': site.id, - 'url': 'http://foo.org/', - 'via_page_id': seed_page.id, - 'outlinks': { - 'accepted': ['http://example.com/blah'], - 'blocked': [], - 'rejected': ['http://foo.org/bar'], - } + "brozzle_count": 0, + "claimed": False, + "hashtags": [], + "hops_from_seed": 1, + "hops_off": 1, + "id": brozzler.Page.compute_id(site.id, "http://foo.org/"), + "job_id": None, + "needs_robots_check": False, + "priority": 12, + "site_id": site.id, + "url": "http://foo.org/", + "via_page_id": seed_page.id, + "outlinks": { + "accepted": ["http://example.com/blah"], + "blocked": [], + "rejected": ["http://foo.org/bar"], + }, } pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) assert len(pages) == 5 assert { - 'brozzle_count': 0, - 'claimed': False, - 'hashtags': [], - 'hops_from_seed': 2, - 'hops_off': 0, - 'id': 
brozzler.Page.compute_id(site.id, 'http://example.com/blah'), - 'job_id': None, - 'needs_robots_check': False, - 'priority': 11, - 'site_id': site.id, - 'url': 'http://example.com/blah', - 'via_page_id': foo_page.id + "brozzle_count": 0, + "claimed": False, + "hashtags": [], + "hops_from_seed": 2, + "hops_off": 0, + "id": brozzler.Page.compute_id(site.id, "http://example.com/blah"), + "job_id": None, + "needs_robots_check": False, + "priority": 11, + "site_id": site.id, + "url": "http://example.com/blah", + "via_page_id": foo_page.id, } in pages - diff --git a/tests/test_units.py b/tests/test_units.py index 43268672..b7a785ff 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -''' +""" test_units.py - some unit tests for parts of brozzler amenable to that Copyright (C) 2016-2017 Internet Archive @@ -15,7 +15,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -''' +""" import pytest import http.server @@ -37,99 +37,131 @@ from unittest import mock logging.basicConfig( - stream=sys.stderr, level=logging.INFO, format=( - '%(asctime)s %(process)d %(levelname)s %(threadName)s ' - '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')) + stream=sys.stderr, + level=logging.INFO, + format=( + "%(asctime)s %(process)d %(levelname)s %(threadName)s " + "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s" + ), +) -@pytest.fixture(scope='module') + +@pytest.fixture(scope="module") def httpd(request): # SimpleHTTPRequestHandler always uses CWD so we have to chdir - os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) + os.chdir(os.path.join(os.path.dirname(__file__), "htdocs")) httpd = http.server.HTTPServer( - ('localhost', 0), http.server.SimpleHTTPRequestHandler) - httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) + ("localhost", 0), http.server.SimpleHTTPRequestHandler + ) + httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever) httpd_thread.start() def fin(): httpd.shutdown() httpd.server_close() httpd_thread.join() + request.addfinalizer(fin) return httpd + def test_robots(httpd): - ''' + """ Basic test of robots.txt user-agent substring matching. 
- ''' - url = 'http://localhost:%s/' % httpd.server_port - site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'}) + """ + url = "http://localhost:%s/" % httpd.server_port + site = brozzler.Site(None, {"seed": url, "user_agent": "im/a/GoOdbot/yep"}) assert brozzler.is_permitted_by_robots(site, url) - site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'}) + site = brozzler.Site(None, {"seed": url, "user_agent": "im/a bAdBOt/uh huh"}) assert not brozzler.is_permitted_by_robots(site, url) + def test_robots_http_statuses(): for status in ( - 200, 204, 400, 401, 402, 403, 404, 405, - 500, 501, 502, 503, 504, 505): + 200, + 204, + 400, + 401, + 402, + 403, + 404, + 405, + 500, + 501, + 502, + 503, + 504, + 505, + ): + class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self): - response = (('HTTP/1.1 %s Meaningless message\r\n' - + 'Content-length: 0\r\n' - + '\r\n') % status).encode('utf-8') + response = ( + ( + "HTTP/1.1 %s Meaningless message\r\n" + + "Content-length: 0\r\n" + + "\r\n" + ) + % status + ).encode("utf-8") self.connection.sendall(response) # self.send_response(status) # self.end_headers() - httpd = http.server.HTTPServer(('localhost', 0), Handler) - httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) + + httpd = http.server.HTTPServer(("localhost", 0), Handler) + httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever) httpd_thread.start() try: - url = 'http://localhost:%s/' % httpd.server_port - site = brozzler.Site(None, {'seed': url}) + url = "http://localhost:%s/" % httpd.server_port + site = brozzler.Site(None, {"seed": url}) assert brozzler.is_permitted_by_robots(site, url) finally: httpd.shutdown() httpd.server_close() httpd_thread.join() + def test_robots_empty_response(): class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self): self.connection.shutdown(socket.SHUT_RDWR) self.connection.close() - httpd = http.server.HTTPServer(('localhost', 0), Handler) - httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) + + httpd = http.server.HTTPServer(("localhost", 0), Handler) + httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever) httpd_thread.start() try: - url = 'http://localhost:%s/' % httpd.server_port - site = brozzler.Site(None, {'seed': url}) + url = "http://localhost:%s/" % httpd.server_port + site = brozzler.Site(None, {"seed": url}) assert brozzler.is_permitted_by_robots(site, url) finally: httpd.shutdown() httpd.server_close() httpd_thread.join() + def test_robots_socket_timeout(): stop_hanging = threading.Event() + class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self): stop_hanging.wait(60) - self.connection.sendall( - b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n') + self.connection.sendall(b"HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n") orig_timeout = brozzler.robots._SessionRaiseOn420.timeout - httpd = http.server.HTTPServer(('localhost', 0), Handler) - httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) + httpd = http.server.HTTPServer(("localhost", 0), Handler) + httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever) httpd_thread.start() try: - url = 'http://localhost:%s/' % httpd.server_port - site = brozzler.Site(None, {'seed': url}) + url = "http://localhost:%s/" % httpd.server_port + site = brozzler.Site(None, {"seed": url}) brozzler.robots._SessionRaiseOn420.timeout = 2 assert brozzler.is_permitted_by_robots(site, url) finally: @@ -139,20 +171,24 @@ def 
do_GET(self): httpd.server_close() httpd_thread.join() + def test_robots_dns_failure(): # .invalid. is guaranteed nonexistent per rfc 6761 - url = 'http://whatever.invalid./' - site = brozzler.Site(None, {'seed': url}) + url = "http://whatever.invalid./" + site = brozzler.Site(None, {"seed": url}) assert brozzler.is_permitted_by_robots(site, url) + def test_robots_connection_failure(): # .invalid. is guaranteed nonexistent per rfc 6761 - url = 'http://localhost:4/' # nobody listens on port 4 - site = brozzler.Site(None, {'seed': url}) + url = "http://localhost:4/" # nobody listens on port 4 + site = brozzler.Site(None, {"seed": url}) assert brozzler.is_permitted_by_robots(site, url) + def test_scoping(): - test_scope = yaml.safe_load(''' + test_scope = yaml.safe_load( + """ max_hops: 100 accepts: - url_match: REGEX_MATCH @@ -169,40 +205,73 @@ def test_scoping(): - domain: twitter.com url_match: REGEX_MATCH value: ^.*lang=(?!en).*$ -''') - - site = brozzler.Site(None, { - 'id': 1, 'seed': 'http://example.com/foo/bar?baz=quux#monkey', - 'scope': test_scope}) - page = brozzler.Page(None, { - 'url': 'http://example.com/foo/bar?baz=quux#monkey', - 'site_id': site.id}) - - assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True - assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None - - assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None - assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True - - assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True - assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None +""" + ) + + site = brozzler.Site( + None, + { + "id": 1, + "seed": "http://example.com/foo/bar?baz=quux#monkey", + "scope": test_scope, + }, + ) + page = brozzler.Page( + None, {"url": "http://example.com/foo/bar?baz=quux#monkey", "site_id": site.id} + ) + + assert site.accept_reject_or_neither("http://example.com/foo/bar", page) is True + assert site.accept_reject_or_neither("http://example.com/foo/baz", page) is None + + assert site.accept_reject_or_neither("http://foo.com/some.mp3", page) is None + assert ( + site.accept_reject_or_neither("http://foo.com/blah/audio_file/some.mp3", page) + is True + ) + + assert ( + site.accept_reject_or_neither("http://a.b.vimeocdn.com/blahblah", page) is True + ) + assert ( + site.accept_reject_or_neither("https://a.b.vimeocdn.com/blahblah", page) is None + ) + + assert site.accept_reject_or_neither("https://twitter.com/twit", page) is True + assert ( + site.accept_reject_or_neither("https://twitter.com/twit?lang=en", page) is True + ) + assert ( + site.accept_reject_or_neither("https://twitter.com/twit?lang=es", page) is False + ) + + assert ( + site.accept_reject_or_neither("https://www.facebook.com/whatevz", page) is True + ) + + assert ( + site.accept_reject_or_neither( + "https://www.youtube.com/watch?v=dUIn5OAPS5s", page + ) + is None + ) + yt_user_page = brozzler.Page( + None, + { + "url": "https://www.youtube.com/user/SonoraSantaneraVEVO", + "site_id": site.id, + "hops_from_seed": 10, + }, + ) + assert ( + site.accept_reject_or_neither( + "https://www.youtube.com/watch?v=dUIn5OAPS5s", yt_user_page + ) + is True + ) - assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True - assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True - assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is 
False - - assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True - - assert site.accept_reject_or_neither( - 'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None - yt_user_page = brozzler.Page(None, { - 'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO', - 'site_id': site.id, 'hops_from_seed': 10}) - assert site.accept_reject_or_neither( - 'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True def test_proxy_down(): - ''' + """ Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down. This test needs to cover every possible fetch through the proxy other than @@ -211,24 +280,24 @@ def test_proxy_down(): Tests two different kinds of connection error: - nothing listening the port (nobody listens on on port 4 :)) - port bound but not accepting connections - ''' + """ sock = socket.socket() - sock.bind(('127.0.0.1', 0)) - for not_listening_proxy in ( - '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]): - worker = brozzler.BrozzlerWorker( - frontier=None, proxy=not_listening_proxy) - site = brozzler.Site(None, { - 'id': str(uuid.uuid4()), 'seed': 'http://example.com/'}) - page = brozzler.Page(None, {'url': 'http://example.com/'}) + sock.bind(("127.0.0.1", 0)) + for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]): + worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy) + site = brozzler.Site( + None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"} + ) + page = brozzler.Page(None, {"url": "http://example.com/"}) # robots.txt fetch with pytest.raises(brozzler.ProxyError): brozzler.is_permitted_by_robots( - site, 'http://example.com/', proxy=not_listening_proxy) + site, "http://example.com/", proxy=not_listening_proxy + ) # youtube-dl fetch - with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: + with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir: with pytest.raises(brozzler.ProxyError): brozzler.ydl.do_youtube_dl(worker, site, page) @@ -239,47 +308,58 @@ def test_proxy_down(): # WARCPROX_WRITE_RECORD with pytest.raises(brozzler.ProxyError): worker._warcprox_write_record( - warcprox_address=not_listening_proxy, - url='test://proxy_down/warcprox_write_record', - warc_type='metadata', - content_type='text/plain', - payload=b'''payload doesn't matter here''') + warcprox_address=not_listening_proxy, + url="test://proxy_down/warcprox_write_record", + warc_type="metadata", + content_type="text/plain", + payload=b"""payload doesn't matter here""", + ) + def test_start_stop_backwards_compat(): - site = brozzler.Site(None, {'seed': 'http://example.com/'}) + site = brozzler.Site(None, {"seed": "http://example.com/"}) assert len(site.starts_and_stops) == 1 - assert site.starts_and_stops[0]['start'] - assert site.starts_and_stops[0]['stop'] is None - assert not 'start_time' in site - - site = brozzler.Site(None, { - 'seed': 'http://example.com/', - 'start_time': datetime.datetime(2017,1,1)}) + assert site.starts_and_stops[0]["start"] + assert site.starts_and_stops[0]["stop"] is None + assert not "start_time" in site + + site = brozzler.Site( + None, + {"seed": "http://example.com/", "start_time": datetime.datetime(2017, 1, 1)}, + ) assert len(site.starts_and_stops) == 1 - assert site.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1) - assert site.starts_and_stops[0]['stop'] is None - assert not 'start_time' in site - - job = brozzler.Job(None, {'seeds': [{'url':'https://example.com/'}]}) - assert job.starts_and_stops[0]['start'] - 
assert job.starts_and_stops[0]['stop'] is None - assert not 'started' in job - assert not 'finished' in job - - job = brozzler.Job(None, { - 'seeds': [{'url':'https://example.com/'}], - 'started': datetime.datetime(2017, 1, 1), - 'finished': datetime.datetime(2017, 1, 2)}) - assert job.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1) - assert job.starts_and_stops[0]['stop'] == datetime.datetime(2017, 1, 2) - assert not 'started' in job - assert not 'finished' in job + assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1) + assert site.starts_and_stops[0]["stop"] is None + assert not "start_time" in site + + job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]}) + assert job.starts_and_stops[0]["start"] + assert job.starts_and_stops[0]["stop"] is None + assert not "started" in job + assert not "finished" in job + + job = brozzler.Job( + None, + { + "seeds": [{"url": "https://example.com/"}], + "started": datetime.datetime(2017, 1, 1), + "finished": datetime.datetime(2017, 1, 2), + }, + ) + assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1) + assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2) + assert not "started" in job + assert not "finished" in job + class Exception1(Exception): pass + + class Exception2(Exception): pass + def test_thread_raise_not_accept(): def never_accept(): try: @@ -297,6 +377,7 @@ def never_accept(): th.join() assert thread_caught_exception is None + def test_thread_raise_immediate(): def accept_immediately(): try: @@ -317,13 +398,17 @@ def accept_immediately(): assert isinstance(thread_caught_exception, Exception1) assert time.time() - start < 1.0 + def test_thread_raise_safe_exit(): def delay_context_exit(): gate = brozzler.thread_accept_exceptions() orig_exit = type(gate).__exit__ try: type(gate).__exit__ = lambda self, et, ev, t: ( - brozzler.sleep(2), orig_exit(self, et, ev, t), False)[-1] + brozzler.sleep(2), + orig_exit(self, et, ev, t), + False, + )[-1] with brozzler.thread_accept_exceptions() as gate: brozzler.sleep(2) except Exception as e: @@ -345,6 +430,7 @@ def delay_context_exit(): assert thread_caught_exception assert isinstance(thread_caught_exception, Exception1) + def test_thread_raise_pending_exception(): def accept_eventually(): try: @@ -365,16 +451,17 @@ def accept_eventually(): assert isinstance(thread_caught_exception, Exception1) assert time.time() - start > 1.0 + def test_thread_raise_second_with_block(): def two_with_blocks(): try: with brozzler.thread_accept_exceptions(): time.sleep(2) - return # test fails + return # test fails except Exception1 as e: pass except: - return # fail test + return # fail test try: with brozzler.thread_accept_exceptions(): @@ -393,52 +480,79 @@ def two_with_blocks(): th.join() assert isinstance(thread_caught_exception, Exception2) + def test_needs_browsing(): # only one test case here right now, which exposed a bug class ConvenientHeaders(http.client.HTTPMessage): def __init__(self, headers): http.client.HTTPMessage.__init__(self) - for (k, v) in headers.items(): + for k, v in headers.items(): self.add_header(k, v) - page = brozzler.Page(None, { - 'url':'http://example.com/a'}) + page = brozzler.Page(None, {"url": "http://example.com/a"}) spy = brozzler.ydl.YoutubeDLSpy() - spy.fetches.append({ - 'url': 'http://example.com/a', - 'method': 'HEAD', - 'response_code': 301, - 'response_headers': ConvenientHeaders({'Location': '/b'})}) - spy.fetches.append({ - 'url': 'http://example.com/b', - 'method': 'GET', - 
'response_code': 200, - 'response_headers': ConvenientHeaders({ - 'Content-Type': 'application/pdf'})}) - - assert not brozzler.worker.BrozzlerWorker._needs_browsing( - None, page, spy.fetches) + spy.fetches.append( + { + "url": "http://example.com/a", + "method": "HEAD", + "response_code": 301, + "response_headers": ConvenientHeaders({"Location": "/b"}), + } + ) + spy.fetches.append( + { + "url": "http://example.com/b", + "method": "GET", + "response_code": 200, + "response_headers": ConvenientHeaders({"Content-Type": "application/pdf"}), + } + ) + + assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy.fetches) + def test_seed_redirect(): - site = brozzler.Site(None, {'seed': 'http://foo.com/'}) - site.note_seed_redirect('https://foo.com/a/b/c') - assert site.scope == {'accepts': [ - {'ssurt': 'com,foo,//http:/',}, - {'ssurt': 'com,foo,//https:/',}]} - - site = brozzler.Site(None, {'seed': 'https://foo.com/'}) - site.note_seed_redirect('http://foo.com/a/b/c') - assert site.scope == {'accepts': [ - {'ssurt': 'com,foo,//https:/',}, - {'ssurt': 'com,foo,//http:/',}]} - - site = brozzler.Site(None, {'seed': 'http://foo.com/'}) - site.note_seed_redirect('https://bar.com/a/b/c') - assert site.scope == {'accepts': [ - {'ssurt': 'com,foo,//http:/',}, - {'ssurt': 'com,bar,//https:/a/b/c',}]} + site = brozzler.Site(None, {"seed": "http://foo.com/"}) + site.note_seed_redirect("https://foo.com/a/b/c") + assert site.scope == { + "accepts": [ + { + "ssurt": "com,foo,//http:/", + }, + { + "ssurt": "com,foo,//https:/", + }, + ] + } + + site = brozzler.Site(None, {"seed": "https://foo.com/"}) + site.note_seed_redirect("http://foo.com/a/b/c") + assert site.scope == { + "accepts": [ + { + "ssurt": "com,foo,//https:/", + }, + { + "ssurt": "com,foo,//http:/", + }, + ] + } + + site = brozzler.Site(None, {"seed": "http://foo.com/"}) + site.note_seed_redirect("https://bar.com/a/b/c") + assert site.scope == { + "accepts": [ + { + "ssurt": "com,foo,//http:/", + }, + { + "ssurt": "com,bar,//https:/a/b/c", + }, + ] + } + def test_limit_failures(): page = mock.Mock() @@ -446,9 +560,9 @@ def test_limit_failures(): page.brozzle_count = 0 site = mock.Mock() - site.status = 'ACTIVE' + site.status = "ACTIVE" site.active_brozzling_time = 0 - site.starts_and_stops = [{'start':datetime.datetime.utcnow()}] + site.starts_and_stops = [{"start": datetime.datetime.utcnow()}] rr = mock.Mock() rr.servers = [mock.Mock()] @@ -456,11 +570,12 @@ def test_limit_failures(): rr.db_list = mock.Mock(return_value=rethink_query) rr.table_list = mock.Mock(return_value=rethink_query) rr.table = mock.Mock( - return_value=mock.Mock( - between=mock.Mock( - return_value=mock.Mock( - limit=mock.Mock( - return_value=rethink_query))))) + return_value=mock.Mock( + between=mock.Mock( + return_value=mock.Mock(limit=mock.Mock(return_value=rethink_query)) + ) + ) + ) assert rr.table().between().limit().run() == [] frontier = brozzler.RethinkDbFrontier(rr) frontier.enforce_time_limit = mock.Mock() @@ -475,20 +590,19 @@ def test_limit_failures(): assert page.failed_attempts is None assert page.brozzle_count == 0 - assert site.status == 'ACTIVE' + assert site.status == "ACTIVE" worker.brozzle_site(browser, site) assert page.failed_attempts == 1 assert page.brozzle_count == 0 - assert site.status == 'ACTIVE' + assert site.status == "ACTIVE" worker.brozzle_site(browser, site) assert page.failed_attempts == 2 assert page.brozzle_count == 0 - assert site.status == 'ACTIVE' + assert site.status == "ACTIVE" worker.brozzle_site(browser, site) 
assert page.failed_attempts == 3 assert page.brozzle_count == 1 - assert site.status == 'FINISHED' - + assert site.status == "FINISHED" diff --git a/vagrant/vagrant-brozzler-new-job.py b/vagrant/vagrant-brozzler-new-job.py index c75d0756..b653b2b4 100755 --- a/vagrant/vagrant-brozzler-new-job.py +++ b/vagrant/vagrant-brozzler-new-job.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -''' +""" vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to queue a job for your vagrant brozzler deployment. @@ -20,30 +20,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -''' +""" import sys import os import argparse import subprocess + def main(argv=[]): arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0])) arg_parser.add_argument( - 'job_conf_file', metavar='JOB_CONF_FILE', - help='brozzler job configuration file in yaml') + "job_conf_file", + metavar="JOB_CONF_FILE", + help="brozzler job configuration file in yaml", + ) args = arg_parser.parse_args(args=argv[1:]) # cd to path with Vagrantfile so "vagrant ssh" knows what to do os.chdir(os.path.dirname(__file__)) - with open(args.job_conf_file, 'rb') as f: - subprocess.call([ - 'vagrant', 'ssh', '--', - 'f=`mktemp` && cat > $f && ' - '/home/vagrant/brozzler-ve3/bin/python ' - '/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'], - stdin=f) - -if __name__ == '__main__': + with open(args.job_conf_file, "rb") as f: + subprocess.call( + [ + "vagrant", + "ssh", + "--", + "f=`mktemp` && cat > $f && " + "/home/vagrant/brozzler-ve3/bin/python " + "/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f", + ], + stdin=f, + ) + + +if __name__ == "__main__": main(sys.argv) diff --git a/vagrant/vagrant-brozzler-new-site.py b/vagrant/vagrant-brozzler-new-site.py index b0a0d800..244bf6b1 100755 --- a/vagrant/vagrant-brozzler-new-site.py +++ b/vagrant/vagrant-brozzler-new-site.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -''' +""" vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to queue a site for your vagrant brozzler deployment. @@ -23,61 +23,69 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-''' +""" import sys import os import argparse import subprocess + try: from shlex import quote except: from pipes import quote + def main(argv=[]): arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0])) - arg_parser.add_argument('seed', metavar='SEED', help='seed url') - arg_parser.add_argument( - '--time-limit', dest='time_limit', default=None, - help='time limit in seconds for this site') + arg_parser.add_argument("seed", metavar="SEED", help="seed url") arg_parser.add_argument( - '--ignore-robots', dest='ignore_robots', action='store_true', - help='ignore robots.txt for this site') + "--time-limit", + dest="time_limit", + default=None, + help="time limit in seconds for this site", + ) arg_parser.add_argument( - '--warcprox-meta', dest='warcprox_meta', - help=( - 'Warcprox-Meta http request header to send with each request; ' - 'must be a json blob, ignored unless warcprox features are ' - 'enabled')) + "--ignore-robots", + dest="ignore_robots", + action="store_true", + help="ignore robots.txt for this site", + ) arg_parser.add_argument( - '-q', '--quiet', dest='quiet', action='store_true') - arg_parser.add_argument( - '-v', '--verbose', dest='verbose', action='store_true') + "--warcprox-meta", + dest="warcprox_meta", + help=( + "Warcprox-Meta http request header to send with each request; " + "must be a json blob, ignored unless warcprox features are " + "enabled" + ), + ) + arg_parser.add_argument("-q", "--quiet", dest="quiet", action="store_true") + arg_parser.add_argument("-v", "--verbose", dest="verbose", action="store_true") args = arg_parser.parse_args(args=argv[1:]) options = [] if args.time_limit: - options.append('--time-limit=%s' % args.time_limit) + options.append("--time-limit=%s" % args.time_limit) if args.ignore_robots: - options.append('--ignore-robots') + options.append("--ignore-robots") if args.warcprox_meta: # I think this shell escaping is correct? - options.append( - '--warcprox-meta=%s' % quote(args.warcprox_meta)) + options.append("--warcprox-meta=%s" % quote(args.warcprox_meta)) if args.quiet: - options.append('--quiet') + options.append("--quiet") if args.verbose: - options.append('--verbose') + options.append("--verbose") # cd to path with Vagrantfile so "vagrant ssh" knows what to do os.chdir(os.path.dirname(__file__)) cmd = ( - '/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site ' - '%s %s') % (' '.join(options), args.seed) - subprocess.call(['vagrant', 'ssh', '--', cmd]) + "/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site " "%s %s" + ) % (" ".join(options), args.seed) + subprocess.call(["vagrant", "ssh", "--", cmd]) -if __name__ == '__main__': - main(sys.argv) +if __name__ == "__main__": + main(sys.argv)
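
A minimal usage sketch for the reformatted vagrant-brozzler-new-site.py above (its command-line interface is unchanged by this diff). This assumes the vagrant brozzler VM is already up; the seed URL and time limit below are placeholder values, not part of the change:

    # placeholder seed URL and time limit; requires the vagrant VM to be running
    python vagrant/vagrant-brozzler-new-site.py --time-limit=3600 --ignore-robots 'https://example.com/'

Because the script does os.chdir(os.path.dirname(__file__)) before calling "vagrant ssh", it can be invoked from any working directory.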