From 170377fe899be3fd49e765a83afa87664569f994 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 23 Jan 2025 10:58:32 -0800 Subject: [PATCH 1/5] yt-dlp proxy handling update --- brozzler/cli.py | 17 ++++++++++++++++- brozzler/worker.py | 4 +++- brozzler/ydl.py | 9 ++++----- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 32d08b2b..dc1f6114 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -2,7 +2,7 @@ """ brozzler/cli.py - brozzler command line executables -Copyright (C) 2014-2024 Internet Archive +Copyright (C) 2014-2025 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -611,14 +611,29 @@ def get_skip_av_seeds(): logging.info("running with empty skip_av_seeds") return skip_av_seeds + def get_proxy_endpoints(): + PROXY_ENDPOINTS_FILE = "/opt/local/brozzler/proxy_endpoints.txt" + try: + # make list from file + with open(PROXY_ENDPOINTS_FILE) as endpoints: + proxy_endpoints = [l for l in endpoints.readlines()] + if proxy_endpoints: + logging.info("running with proxy endpoints file %s" % PROXY_ENDPOINTS_FILE) + except Exception as e: + proxy_endpoints = [] + logging.info("running with empty proxy endpoints file") + return proxy_endpoints + rr = rethinker(args) frontier = brozzler.RethinkDbFrontier(rr) service_registry = doublethink.ServiceRegistry(rr) skip_av_seeds_from_file = get_skip_av_seeds() + proxy_endpoints_from_file = get_proxy_endpoints() worker = brozzler.worker.BrozzlerWorker( frontier, service_registry, skip_av_seeds=skip_av_seeds_from_file, + proxy_endpoints=proxy_endpoints_from_file, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, proxy=args.proxy, diff --git a/brozzler/worker.py b/brozzler/worker.py index 4abfcd55..74286190 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -57,6 +57,7 @@ def __init__( frontier, service_registry=None, skip_av_seeds=None, + proxy_endpoints=None, max_browsers=1, chrome_exe="chromium-browser", warcprox_auto=False, @@ -81,6 +82,7 @@ def __init__( self._frontier = frontier self._service_registry = service_registry self._skip_av_seeds = skip_av_seeds + self._proxy_endpoints = proxy_endpoints self._max_browsers = max_browsers self._warcprox_auto = warcprox_auto @@ -287,7 +289,7 @@ def brozzle_page( self.logger.info("page interstitial shown (http auth): %s", page) if enable_youtube_dl and ydl.should_ytdlp( - site, page, status_code, self._skip_av_seeds + site, page, status_code, self._skip_av_seeds, self._proxy_endpoints ): try: ydl_outlinks = ydl.do_youtube_dl(self, site, page) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 861b6e68..f2b06969 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -34,12 +34,11 @@ thread_local = threading.local() -YTDLP_PROXY = "" PROXY_ATTEMPTS = 4 YTDLP_WAIT = 10 -def should_ytdlp(site, page, page_status, skip_av_seeds): +def should_ytdlp(site, page, page_status, skip_av_seeds, proxy_endpoints): # called only after we've passed needs_browsing() check if page_status != 200: @@ -285,11 +284,11 @@ def ydl_postprocess_hook(d): ytdlp_url = page.redirect_url if page.redirect_url else page.url is_youtube_host = isyoutubehost(ytdlp_url) - if is_youtube_host and YTDLP_PROXY: - ydl_opts["proxy"] = YTDLP_PROXY + if is_youtube_host and proxy_endpoints: + ydl_opts["proxy"] = random.choice(proxy_endpoints) # don't log proxy value secrets ytdlp_proxy_for_logs = ( - YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@" + ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@" ) logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs) From 854970f4dda7a5012866554c1df984bca9968d01 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 23 Jan 2025 11:21:05 -0800 Subject: [PATCH 2/5] black'd --- brozzler/cli.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index dc1f6114..59025e33 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -611,14 +611,16 @@ def get_skip_av_seeds(): logging.info("running with empty skip_av_seeds") return skip_av_seeds - def get_proxy_endpoints(): + def get_proxy_endpoints(): PROXY_ENDPOINTS_FILE = "/opt/local/brozzler/proxy_endpoints.txt" try: # make list from file with open(PROXY_ENDPOINTS_FILE) as endpoints: proxy_endpoints = [l for l in endpoints.readlines()] if proxy_endpoints: - logging.info("running with proxy endpoints file %s" % PROXY_ENDPOINTS_FILE) + logging.info( + "running with proxy endpoints file %s" % PROXY_ENDPOINTS_FILE + ) except Exception as e: proxy_endpoints = [] logging.info("running with empty proxy endpoints file") From baa33e3079dd68af81279d074786222ab9372aef Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 23 Jan 2025 12:17:07 -0800 Subject: [PATCH 3/5] ytdlp_proxy --- brozzler/cli.py | 20 ++++++++++---------- brozzler/worker.py | 10 +++++----- brozzler/ydl.py | 15 ++++++++------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 59025e33..16d2e76f 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -611,31 +611,31 @@ def get_skip_av_seeds(): logging.info("running with empty skip_av_seeds") return skip_av_seeds - def get_proxy_endpoints(): - PROXY_ENDPOINTS_FILE = "/opt/local/brozzler/proxy_endpoints.txt" + def get_ytdlp_proxy_endpoints(): + YTDLP_PROXY_ENDPOINTS_FILE = "/opt/local/brozzler/ytdlp_proxy_endpoints.txt" try: # make list from file - with open(PROXY_ENDPOINTS_FILE) as endpoints: - proxy_endpoints = [l for l in endpoints.readlines()] - if proxy_endpoints: + with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints: + ytdlp_proxy_endpoints = [l for l in endpoints.readlines()] + if ytdlp_proxy_endpoints: logging.info( - "running with proxy endpoints file %s" % PROXY_ENDPOINTS_FILE + "running with ytdlp proxy endpoints file %s" % YTDLP_PROXY_ENDPOINTS_FILE ) except Exception as e: - proxy_endpoints = [] + ytdlp_proxy_endpoints = [] logging.info("running with empty proxy endpoints file") - return proxy_endpoints + return ytdlp_proxy_endpoints rr = rethinker(args) frontier = brozzler.RethinkDbFrontier(rr) service_registry = doublethink.ServiceRegistry(rr) skip_av_seeds_from_file = get_skip_av_seeds() - proxy_endpoints_from_file = get_proxy_endpoints() + ytdlp_proxy_endpoints_from_file = get_ytdlp_proxy_endpoints() worker = brozzler.worker.BrozzlerWorker( frontier, service_registry, skip_av_seeds=skip_av_seeds_from_file, - proxy_endpoints=proxy_endpoints_from_file, + ytdlp_proxy_endpoints=ytdlp_proxy_endpoints_from_file, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, proxy=args.proxy, diff --git a/brozzler/worker.py b/brozzler/worker.py index 74286190..2d8bca98 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -3,7 +3,7 @@ it runs yt-dlp on them, browses them and runs behaviors if appropriate, scopes and adds outlinks to the frontier -Copyright (C) 2014-2024 Internet Archive +Copyright (C) 2014-2025 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -57,7 +57,7 @@ def __init__( frontier, service_registry=None, skip_av_seeds=None, - proxy_endpoints=None, + ytdlp_proxy_endpoints=None, max_browsers=1, chrome_exe="chromium-browser", warcprox_auto=False, @@ -82,7 +82,7 @@ def __init__( self._frontier = frontier self._service_registry = service_registry self._skip_av_seeds = skip_av_seeds - self._proxy_endpoints = proxy_endpoints + self._ytdlp_proxy_endpoints = ytdlp_proxy_endpoints self._max_browsers = max_browsers self._warcprox_auto = warcprox_auto @@ -289,10 +289,10 @@ def brozzle_page( self.logger.info("page interstitial shown (http auth): %s", page) if enable_youtube_dl and ydl.should_ytdlp( - site, page, status_code, self._skip_av_seeds, self._proxy_endpoints + site, page, status_code, self._skip_av_seeds ): try: - ydl_outlinks = ydl.do_youtube_dl(self, site, page) + ydl_outlinks = ydl.do_youtube_dl(self, site, page, self._ytdlp_proxy_endpoints) metrics.brozzler_ydl_urls_checked.inc(1) outlinks.update(ydl_outlinks) except brozzler.ReachedLimit as e: diff --git a/brozzler/ydl.py b/brozzler/ydl.py index f2b06969..be7e490f 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -1,7 +1,7 @@ """ brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler -Copyright (C) 2024 Internet Archive +Copyright (C) 2024-2025 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ import doublethink import datetime from . import metrics +import random import threading import time @@ -38,7 +39,7 @@ YTDLP_WAIT = 10 -def should_ytdlp(site, page, page_status, skip_av_seeds, proxy_endpoints): +def should_ytdlp(site, page, page_status, skip_av_seeds): # called only after we've passed needs_browsing() check if page_status != 200: @@ -89,7 +90,7 @@ def _http_request(self, req): return req -def _build_youtube_dl(worker, destdir, site, page): +def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints): """ Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`. @@ -284,8 +285,8 @@ def ydl_postprocess_hook(d): ytdlp_url = page.redirect_url if page.redirect_url else page.url is_youtube_host = isyoutubehost(ytdlp_url) - if is_youtube_host and proxy_endpoints: - ydl_opts["proxy"] = random.choice(proxy_endpoints) + if is_youtube_host and ytdlp_proxy_endpoints: + ydl_opts["proxy"] = random.choice(ytdlp_proxy_endpoints) # don't log proxy value secrets ytdlp_proxy_for_logs = ( ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@" @@ -408,7 +409,7 @@ def _try_youtube_dl(worker, ydl, site, page): @metrics.brozzler_ytdlp_duration_seconds.time() @metrics.brozzler_in_progress_ytdlps.track_inprogress() -def do_youtube_dl(worker, site, page): +def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): """ Runs yt-dlp configured for `worker` and `site` to download videos from `page`. @@ -425,7 +426,7 @@ def do_youtube_dl(worker, site, page): prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir ) as tempdir: logging.info("tempdir for yt-dlp: %s", tempdir) - ydl = _build_youtube_dl(worker, tempdir, site, page) + ydl = _build_youtube_dl(worker, tempdir, site, page, ytdlp_proxy_endpoints) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = set() if ie_result and ( From b22349e281846286965e9696de1197c3ae27a0f1 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 23 Jan 2025 12:37:56 -0800 Subject: [PATCH 4/5] black'd --- brozzler/cli.py | 3 ++- brozzler/worker.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 16d2e76f..51c7471a 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -619,7 +619,8 @@ def get_ytdlp_proxy_endpoints(): ytdlp_proxy_endpoints = [l for l in endpoints.readlines()] if ytdlp_proxy_endpoints: logging.info( - "running with ytdlp proxy endpoints file %s" % YTDLP_PROXY_ENDPOINTS_FILE + "running with ytdlp proxy endpoints file %s" + % YTDLP_PROXY_ENDPOINTS_FILE ) except Exception as e: ytdlp_proxy_endpoints = [] diff --git a/brozzler/worker.py b/brozzler/worker.py index 2d8bca98..ce1eb071 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -292,7 +292,9 @@ def brozzle_page( site, page, status_code, self._skip_av_seeds ): try: - ydl_outlinks = ydl.do_youtube_dl(self, site, page, self._ytdlp_proxy_endpoints) + ydl_outlinks = ydl.do_youtube_dl( + self, site, page, self._ytdlp_proxy_endpoints + ) metrics.brozzler_ydl_urls_checked.inc(1) outlinks.update(ydl_outlinks) except brozzler.ReachedLimit as e: From 9e0978298467c016b7061bd6791cb986109cdd08 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 23 Jan 2025 14:35:34 -0800 Subject: [PATCH 5/5] ytdlp_proxy_file param --- brozzler/cli.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 51c7471a..ab73602e 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -546,6 +546,12 @@ def brozzler_worker(argv=None): default="/tmp", help="argparse.SUPPRESS", ) + arg_parser.add_argument( + "--ytdlp_proxy_file", + dest="ytdlp_proxy_file", + default="/opt/local/brozzler/ytdlp_proxy_endpoints.txt", + help="argparse.SUPPRESS", + ) arg_parser.add_argument( "--stealth", dest="stealth", @@ -612,7 +618,7 @@ def get_skip_av_seeds(): return skip_av_seeds def get_ytdlp_proxy_endpoints(): - YTDLP_PROXY_ENDPOINTS_FILE = "/opt/local/brozzler/ytdlp_proxy_endpoints.txt" + YTDLP_PROXY_ENDPOINTS_FILE = args.ytdlp_proxy_file try: # make list from file with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints: