Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Seed-level video capture setting handling + Job-level PDF-only option #288

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 0 additions & 15 deletions brozzler/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,27 +598,12 @@ def dump_state(signum, frame):
finally:
signal.signal(signal.SIGQUIT, dump_state)

def get_skip_av_seeds():
galgeek marked this conversation as resolved.
Show resolved Hide resolved
# TODO: develop UI and refactor
SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
try:
# make set from seed IDs in SKIP_AV_SEEDS_FILE
with open(SKIP_AV_SEEDS_FILE) as skips:
skip_av_seeds = {int(l) for l in skips.readlines()}
logging.info("running with skip_av_seeds file %s" % SKIP_AV_SEEDS_FILE)
except Exception as e:
skip_av_seeds = set()
logging.info("running with empty skip_av_seeds")
return skip_av_seeds

rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
skip_av_seeds_from_file = get_skip_av_seeds()
worker = brozzler.worker.BrozzlerWorker(
frontier,
service_registry,
skip_av_seeds=skip_av_seeds_from_file,
max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe,
proxy=args.proxy,
Expand Down
5 changes: 5 additions & 0 deletions brozzler/job_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,13 @@ seeds:
password:
type: string

video_capture:
type: string

<<: *multi_level_options

max_claimed_sites:
type: integer

pdfs_only:
type: boolean
47 changes: 36 additions & 11 deletions brozzler/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import uuid
import yaml
import zlib
from enum import Enum
from typing import Optional


Expand Down Expand Up @@ -100,6 +101,8 @@ def new_job(frontier, job_conf):
job.id = job_conf["id"]
if "max_claimed_sites" in job_conf:
job.max_claimed_sites = job_conf["max_claimed_sites"]
if "pdfs_only" in job_conf:
job.pdfs_only = job_conf["pdfs_only"]
job.save()

sites = []
Expand Down Expand Up @@ -198,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
def populate_defaults(self):
if not "status" in self:
self.status = "ACTIVE"
if "pdfs_only" not in self:
self.pdfs_only = False
if not "starts_and_stops" in self:
if self.get("started"): # backward compatibility
self.starts_and_stops = [
Expand All @@ -220,33 +225,53 @@ def finish(self):
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()


class VideoCaptureOptions(Enum):
    """
    Allowed values of the `video_capture` config key.

    ENABLE_VIDEO_CAPTURE
        Default. All video is captured.
    DISABLE_VIDEO_CAPTURE
        No video is captured; effectively BLOCK_VIDEO_MIME_TYPES and
        DISABLE_YTDLP_CAPTURE combined.
    BLOCK_VIDEO_MIME_TYPES
        Responses whose Content-Type header contains the word "video" are
        not captured.
    DISABLE_YTDLP_CAPTURE
        Video capture via yt-dlp is disabled.

    Note: fully blocking video MIME types also requires an additional entry
    in the `mime-type-filters` key of the Warcprox-Meta header.
    """

    # Each member's value is its own name, so the plain string stored in a
    # config can be compared against `<member>.value`.
    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"


class Site(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
table = "sites"

def populate_defaults(self):
if not "status" in self:
if "status" not in self:
self.status = "ACTIVE"
if not "claimed" in self:
if "claimed" not in self:
self.claimed = False
if not "last_disclaimed" in self:
if "last_disclaimed" not in self:
self.last_disclaimed = brozzler.EPOCH_UTC
if not "last_claimed" in self:
if "last_claimed" not in self:
self.last_claimed = brozzler.EPOCH_UTC
if not "scope" in self:
if "scope" not in self:
self.scope = {}
if not "skip_ytdlp" in self:
self.skip_ytdlp = None
if "video_capture" not in self:
self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value

# backward compatibility
if "surt" in self.scope:
if not "accepts" in self.scope:
if "accepts" not in self.scope:
self.scope["accepts"] = []
self.scope["accepts"].append({"surt": self.scope["surt"]})
del self.scope["surt"]

# backward compatibility
if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"]
Expand All @@ -256,7 +281,7 @@ def populate_defaults(self):
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)

if not "starts_and_stops" in self:
if "starts_and_stops" not in self:
if self.get("start_time"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("start_time"), "stop": None}
Expand All @@ -271,7 +296,7 @@ def __str__(self):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)

def _accept_ssurt_if_not_redundant(self, ssurt):
if not "accepts" in self.scope:
if "accepts" not in self.scope:
self.scope["accepts"] = []
simple_rule_ssurts = (
rule["ssurt"]
Expand Down
45 changes: 34 additions & 11 deletions brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import logging
import brozzler
import brozzler.browser
from brozzler.model import VideoCaptureOptions
import datetime
import threading
import time
Expand Down Expand Up @@ -56,7 +57,6 @@ def __init__(
self,
frontier,
service_registry=None,
skip_av_seeds=None,
max_browsers=1,
chrome_exe="chromium-browser",
warcprox_auto=False,
Expand All @@ -80,7 +80,6 @@ def __init__(
):
self._frontier = frontier
self._service_registry = service_registry
self._skip_av_seeds = skip_av_seeds
self._max_browsers = max_browsers

self._warcprox_auto = warcprox_auto
Expand Down Expand Up @@ -272,7 +271,17 @@ def brozzle_page(

if not self._needs_browsing(page_headers):
self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page)
if site.pdfs_only and not self._is_pdf(page_headers):
self.logger.info("skipping non-PDF content: PDFs only option enabled")
elif site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
] and self._is_video_type(page_headers):
self.logger.info(
"skipping video content: video MIME type capture disabled for site"
)
else:
self._fetch_url(site, page=page)
else:
self.logger.info("needs browsing: %s", page)
try:
Expand All @@ -286,9 +295,7 @@ def brozzle_page(
except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page)

if enable_youtube_dl and ydl.should_ytdlp(
site, page, status_code, self._skip_av_seeds
):
if enable_youtube_dl and ydl.should_ytdlp(site, page, status_code):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
metrics.brozzler_ydl_urls_checked.inc(1)
Expand Down Expand Up @@ -336,13 +343,29 @@ def _get_page_headers(self, page):
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
return {}

def _needs_browsing(self, page_headers):
if (
def _needs_browsing(self, page_headers) -> bool:
return not bool(
"content-type" in page_headers
and "html" not in page_headers["content-type"]
):
return False
return True
)

def _is_video_type(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it contains
a video.
"""
return (
"content-type" in page_headers and "video" in page_headers["content-type"]
)

def _is_pdf(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it is a PDF.
"""
return (
"content-type" in page_headers
and "application/pdf" in page_headers["content-type"]
)

@metrics.brozzler_browsing_duration_seconds.time()
@metrics.brozzler_in_progress_browses.track_inprogress()
Expand Down
25 changes: 7 additions & 18 deletions brozzler/ydl.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import yt_dlp
from yt_dlp.utils import match_filter_func
import brozzler
from brozzler.model import VideoCaptureOptions
import urllib.request
import tempfile
import urlcanon
Expand All @@ -39,36 +40,24 @@
YTDLP_WAIT = 10


def should_ytdlp(site, page, page_status, skip_av_seeds):
def should_ytdlp(site, page, page_status):
# called only after we've passed needs_browsing() check

if page_status != 200:
logging.info("skipping ytdlp: non-200 page status %s", page_status)
return False
if site.skip_ytdlp:
logging.info("skipping ytdlp: site marked skip_ytdlp")
if site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
]:
logging.info("skipping ytdlp: site has video capture disabled")
return False

ytdlp_url = page.redirect_url if page.redirect_url else page.url

if "chrome-error:" in ytdlp_url:
return False

ytdlp_seed = (
site["metadata"]["ait_seed_id"]
if "metadata" in site and "ait_seed_id" in site["metadata"]
else None
)

# TODO: develop UI and refactor
if ytdlp_seed:
if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
logging.info("skipping ytdlp: site in skip_av_seeds")
site.skip_ytdlp = True
return False
else:
site.skip_ytdlp = False

return True


Expand Down
33 changes: 33 additions & 0 deletions job-conf.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
simultaneously across the cluster. Addresses the problem of a job with many
seeds starving out other jobs.

``pdfs_only``
galgeek marked this conversation as resolved.
Show resolved Hide resolved
~~~~~~~~~~~~~~~~~~~~~
+---------+----------+-----------+
| type | required | default |
+=========+==========+===========+
| boolean | no | ``false`` |
+---------+----------+-----------+
Limits capture to PDFs based on the MIME type set in the HTTP response's
Content-Type header. This value only impacts processing of outlinks within
Brozzler.

*Note: Ensuring comprehensive limiting to only PDFs requires an additional
entry in the Warcprox-Meta header `mime-type-filters` key.*

``seeds``
~~~~~~~~~
+------------------------+----------+---------+
Expand Down Expand Up @@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
the default values in place. Brozzler submits login forms after page load.
Then brozzling proceeds as usual.

``video_capture``
~~~~~~~~~~~~~~~~~
+--------+----------+--------------------------+
| type | required | default |
+========+==========+==========================+
| string | no | ``ENABLE_VIDEO_CAPTURE`` |
+--------+----------+--------------------------+
Determines the level of video capture for the seed. This is an enumeration with four possible values:

* ENABLE_VIDEO_CAPTURE (default): All video is captured.
* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
combination of the next two values.
* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
the word "video" is not captured.
* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.

*Note: Ensuring full video MIME type blocking requires an additional entry in
the Warcprox-Meta header `mime-type-filters` key.*

Seed-level / top-level settings
-------------------------------
These are seed settings that can also be specified at the top level, in which
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ maintainers = [
{ name="Adam Miller", email="[email protected]" },
{ name="Barbara Miller", email="[email protected]" },
{ name="Alex Dempsey", email="[email protected]" },
{ name="Gretchen Leigh Miller", email="[email protected]" },
]
description = "Distributed web crawling with browsers"
readme = "README.rst"
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def find_package_data(package):

setuptools.setup(
name="brozzler",
version="1.6.5",
version="1.7.0",
description="Distributed web crawling with browsers",
url="https://github.com/internetarchive/brozzler",
author="Noah Levitt",
Expand Down
Loading