internetarchive · gretchenleighmiller · Sep 12, 2024 · Sep 12, 2024 · Sep 12, 2024 · Sep 13, 2024
diff --git a/brozzler/cli.py b/brozzler/cli.py
@@ -598,27 +598,12 @@ def dump_state(signum, frame):
         finally:
             signal.signal(signal.SIGQUIT, dump_state)
 
-    def get_skip_av_seeds():
-        # TODO: develop UI and refactor
-        SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
-        try:
-            # make set from seed IDs in SKIP_AV_SEEDS_FILE
-            with open(SKIP_AV_SEEDS_FILE) as skips:
-                skip_av_seeds = {int(l) for l in skips.readlines()}
-                logging.info("running with skip_av_seeds file %s" % SKIP_AV_SEEDS_FILE)
-        except Exception as e:
-            skip_av_seeds = set()
-            logging.info("running with empty skip_av_seeds")
-        return skip_av_seeds
-
     rr = rethinker(args)
     frontier = brozzler.RethinkDbFrontier(rr)
     service_registry = doublethink.ServiceRegistry(rr)
-    skip_av_seeds_from_file = get_skip_av_seeds()
     worker = brozzler.worker.BrozzlerWorker(
         frontier,
         service_registry,
-        skip_av_seeds=skip_av_seeds_from_file,
         max_browsers=int(args.max_browsers),
         chrome_exe=args.chrome_exe,
         proxy=args.proxy,

diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml
@@ -95,8 +95,13 @@ seeds:
       password:
         type: string
 
+      video_capture:
+        type: string
+
       <<: *multi_level_options
 
 max_claimed_sites:
   type: integer
 
+pdfs_only:
+  type: boolean
diff --git a/brozzler/model.py b/brozzler/model.py
@@ -34,6 +34,7 @@
 import uuid
 import yaml
 import zlib
+from enum import Enum
 from typing import Optional
 
 
@@ -100,6 +101,8 @@ def new_job(frontier, job_conf):
         job.id = job_conf["id"]
     if "max_claimed_sites" in job_conf:
         job.max_claimed_sites = job_conf["max_claimed_sites"]
+    if "pdfs_only" in job_conf:
+        job.pdfs_only = job_conf["pdfs_only"]
     job.save()
 
     sites = []
@@ -198,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
     def populate_defaults(self):
         if not "status" in self:
             self.status = "ACTIVE"
+        if "pdfs_only" not in self:
+            self.pdfs_only = False
         if not "starts_and_stops" in self:
             if self.get("started"):  # backward compatibility
                 self.starts_and_stops = [
@@ -220,33 +225,53 @@ def finish(self):
         self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
 
 
+class VideoCaptureOptions(Enum):
+    """
+    Enumeration of possible values for the `video_capture` config key.
+        - ENABLE_VIDEO_CAPTURE (default): All video is captured.
+        - DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
+          combination of the next two values.
+        - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
+          containing the word "video" is not captured.
+        - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
+
+    Note: Ensuring full video MIME type blocking requires an additional entry in the
+          Warcprox-Meta header `mime-type-filters` key.
+    """
+
+    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
+    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
+    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
+    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
+
+
 class Site(doublethink.Document, ElapsedMixIn):
     logger = logging.getLogger(__module__ + "." + __qualname__)
     table = "sites"
 
     def populate_defaults(self):
-        if not "status" in self:
+        if "status" not in self:
             self.status = "ACTIVE"
-        if not "claimed" in self:
+        if "claimed" not in self:
             self.claimed = False
-        if not "last_disclaimed" in self:
+        if "last_disclaimed" not in self:
             self.last_disclaimed = brozzler.EPOCH_UTC
-        if not "last_claimed" in self:
+        if "last_claimed" not in self:
             self.last_claimed = brozzler.EPOCH_UTC
-        if not "scope" in self:
+        if "scope" not in self:
             self.scope = {}
-        if not "skip_ytdlp" in self:
-            self.skip_ytdlp = None
+        if "video_capture" not in self:
+            self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
 
         # backward compatibility
         if "surt" in self.scope:
-            if not "accepts" in self.scope:
+            if "accepts" not in self.scope:
                 self.scope["accepts"] = []
             self.scope["accepts"].append({"surt": self.scope["surt"]})
             del self.scope["surt"]
 
         # backward compatibility
-        if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
+        if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
             self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
         if "max_hops_off_surt" in self.scope:
             del self.scope["max_hops_off_surt"]
@@ -256,7 +281,7 @@ def populate_defaults(self):
                 brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
             )
 
-        if not "starts_and_stops" in self:
+        if "starts_and_stops" not in self:
             if self.get("start_time"):  # backward compatibility
                 self.starts_and_stops = [
                     {"start": self.get("start_time"), "stop": None}
@@ -271,7 +296,7 @@ def __str__(self):
         return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
 
     def _accept_ssurt_if_not_redundant(self, ssurt):
-        if not "accepts" in self.scope:
+        if "accepts" not in self.scope:
             self.scope["accepts"] = []
         simple_rule_ssurts = (
             rule["ssurt"]

diff --git a/brozzler/worker.py b/brozzler/worker.py
@@ -21,6 +21,7 @@
 import logging
 import brozzler
 import brozzler.browser
+from brozzler.model import VideoCaptureOptions
 import datetime
 import threading
 import time
@@ -56,7 +57,6 @@ def __init__(
         self,
         frontier,
         service_registry=None,
-        skip_av_seeds=None,
         max_browsers=1,
         chrome_exe="chromium-browser",
         warcprox_auto=False,
@@ -80,7 +80,6 @@ def __init__(
     ):
         self._frontier = frontier
         self._service_registry = service_registry
-        self._skip_av_seeds = skip_av_seeds
         self._max_browsers = max_browsers
 
         self._warcprox_auto = warcprox_auto
@@ -272,7 +271,17 @@ def brozzle_page(
 
         if not self._needs_browsing(page_headers):
             self.logger.info("needs fetch: %s", page)
-            self._fetch_url(site, page=page)
+            if site.pdfs_only and not self._is_pdf(page_headers):
+                self.logger.info("skipping non-PDF content: PDFs only option enabled")
+            elif site.video_capture in [
+                VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
+                VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
+            ] and self._is_video_type(page_headers):
+                self.logger.info(
+                    "skipping video content: video MIME type capture disabled for site"
+                )
+            else:
+                self._fetch_url(site, page=page)
         else:
             self.logger.info("needs browsing: %s", page)
             try:
@@ -286,9 +295,7 @@ def brozzle_page(
             except brozzler.PageInterstitialShown:
                 self.logger.info("page interstitial shown (http auth): %s", page)
 
-            if enable_youtube_dl and ydl.should_ytdlp(
-                site, page, status_code, self._skip_av_seeds
-            ):
+            if enable_youtube_dl and ydl.should_ytdlp(site, page, status_code):
                 try:
                     ydl_outlinks = ydl.do_youtube_dl(self, site, page)
                     metrics.brozzler_ydl_urls_checked.inc(1)
@@ -336,13 +343,29 @@ def _get_page_headers(self, page):
             self.logger.warning("Failed to get headers for %s: %s", page.url, e)
             return {}
 
-    def _needs_browsing(self, page_headers):
-        if (
+    def _needs_browsing(self, page_headers) -> bool:
+        return not bool(
             "content-type" in page_headers
             and "html" not in page_headers["content-type"]
-        ):
-            return False
-        return True
+        )
+
+    def _is_video_type(self, page_headers) -> bool:
+        """
+        Determines if the page's Content-Type header specifies that it contains
+        a video.
+        """
+        return (
+            "content-type" in page_headers and "video" in page_headers["content-type"]
+        )
+
+    def _is_pdf(self, page_headers) -> bool:
+        """
+        Determines if the page's Content-Type header specifies that it is a PDF.
+        """
+        return (
+            "content-type" in page_headers
+            and "application/pdf" in page_headers["content-type"]
+        )
 
     @metrics.brozzler_browsing_duration_seconds.time()
     @metrics.brozzler_in_progress_browses.track_inprogress()

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
@@ -20,6 +20,7 @@
 import yt_dlp
 from yt_dlp.utils import match_filter_func
 import brozzler
+from brozzler.model import VideoCaptureOptions
 import urllib.request
 import tempfile
 import urlcanon
@@ -39,36 +40,24 @@
 YTDLP_WAIT = 10
 
 
-def should_ytdlp(site, page, page_status, skip_av_seeds):
+def should_ytdlp(site, page, page_status):
     # called only after we've passed needs_browsing() check
 
     if page_status != 200:
         logging.info("skipping ytdlp: non-200 page status %s", page_status)
         return False
-    if site.skip_ytdlp:
-        logging.info("skipping ytdlp: site marked skip_ytdlp")
+    if site.video_capture in [
+        VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
+        VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
+    ]:
+        logging.info("skipping ytdlp: site has video capture disabled")
         return False
 
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
 
     if "chrome-error:" in ytdlp_url:
         return False
 
-    ytdlp_seed = (
-        site["metadata"]["ait_seed_id"]
-        if "metadata" in site and "ait_seed_id" in site["metadata"]
-        else None
-    )
-
-    # TODO: develop UI and refactor
-    if ytdlp_seed:
-        if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
-            logging.info("skipping ytdlp: site in skip_av_seeds")
-            site.skip_ytdlp = True
-            return False
-        else:
-            site.skip_ytdlp = False
-
     return True
 
 

diff --git a/job-conf.rst b/job-conf.rst
@@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
 simultaneously across the cluster. Addresses the problem of a job with many
 seeds starving out other jobs.
 
+``pdfs_only``
+~~~~~~~~~~~~~
++---------+----------+-----------+
+| type    | required | default   |
++=========+==========+===========+
+| boolean | no       | ``false`` |
++---------+----------+-----------+
+Limits capture to PDFs based on the MIME type set in the HTTP response's
+Content-Type header. This value only impacts processing of outlinks within
+Brozzler.
+
+*Note:* Ensuring comprehensive limiting to only PDFs requires an additional
+entry in the Warcprox-Meta header ``mime-type-filters`` key.
+
 ``seeds``
 ~~~~~~~~~
 +------------------------+----------+---------+
@@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
 the default values in place. Brozzler submits login forms after page load.
 Then brozzling proceeds as usual.
 
+``video_capture``
+~~~~~~~~~~~~~~~~~
++--------+----------+--------------------------+
+| type   | required | default                  |
++========+==========+==========================+
+| string | no       | ``ENABLE_VIDEO_CAPTURE`` |
++--------+----------+--------------------------+
+Determines the level of video capture for the seed. This is an enumeration with four possible values:
+
+* ENABLE_VIDEO_CAPTURE (default): All video is captured.
+* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
+  combination of the next two values.
+* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
+  the word "video" is not captured.
+* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
+
+*Note:* Ensuring full video MIME type blocking requires an additional entry in
+the Warcprox-Meta header ``mime-type-filters`` key.
+
 Seed-level / top-level settings
 -------------------------------
 These are seed settings that can also be specified at the top level, in which

diff --git a/pyproject.toml b/pyproject.toml
@@ -8,6 +8,7 @@ maintainers = [
   { name="Adam Miller", email="[email protected]" },
   { name="Barbara Miller", email="[email protected]" },
   { name="Alex Dempsey", email="[email protected]" },
+  { name="Gretchen Leigh Miller", email="[email protected]" },
 ]
 description = "Distributed web crawling with browsers"
 readme = "README.rst"

diff --git a/setup.py b/setup.py
@@ -34,7 +34,7 @@ def find_package_data(package):
 
 setuptools.setup(
     name="brozzler",
-    version="1.6.5",
+    version="1.7.0",
     description="Distributed web crawling with browsers",
     url="https://github.com/internetarchive/brozzler",
     author="Noah Levitt",