Skip to content

Commit

Permalink
Merge pull request #315 from galgeek/bmiller/proxy_select
Browse files Browse the repository at this point in the history
yt-dlp proxy handling update
  • Loading branch information
galgeek authored Jan 23, 2025
2 parents 1e30b4f + 9e09782 commit 5e701e9
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 11 deletions.
26 changes: 25 additions & 1 deletion brozzler/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
brozzler/cli.py - brozzler command line executables
Copyright (C) 2014-2024 Internet Archive
Copyright (C) 2014-2025 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -546,6 +546,12 @@ def brozzler_worker(argv=None):
default="/tmp",
help="argparse.SUPPRESS",
)
arg_parser.add_argument(
"--ytdlp_proxy_file",
dest="ytdlp_proxy_file",
default="/opt/local/brozzler/ytdlp_proxy_endpoints.txt",
help="argparse.SUPPRESS",
)
arg_parser.add_argument(
"--stealth",
dest="stealth",
Expand Down Expand Up @@ -611,14 +617,32 @@ def get_skip_av_seeds():
logging.info("running with empty skip_av_seeds")
return skip_av_seeds

def get_ytdlp_proxy_endpoints():
    """
    Load the list of yt-dlp proxy endpoints from `args.ytdlp_proxy_file`.

    The file is read as text with one endpoint per line. Surrounding
    whitespace (including the trailing newline) is stripped from every
    line and blank lines are skipped, so each returned entry can be used
    directly as a proxy value.

    Returns:
        list of str: the endpoints found in the file, in file order; an
        empty list if the file cannot be read or holds no endpoints.
    """
    proxy_file = args.ytdlp_proxy_file
    try:
        with open(proxy_file) as endpoints:
            # Strip each line: a raw line keeps its "\n", and a blank line
            # would otherwise be returned as a (truthy) bogus endpoint.
            stripped = (line.strip() for line in endpoints)
            ytdlp_proxy_endpoints = [line for line in stripped if line]
    except (OSError, UnicodeError) as e:
        # Best effort: a missing or unreadable file just means no proxies.
        logging.info(
            "unable to read ytdlp proxy endpoints file %s (%s), "
            "running with no ytdlp proxy endpoints",
            proxy_file,
            e,
        )
        return []

    if ytdlp_proxy_endpoints:
        logging.info("running with ytdlp proxy endpoints file %s", proxy_file)
    else:
        logging.info("running with empty proxy endpoints file")
    return ytdlp_proxy_endpoints

rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
skip_av_seeds_from_file = get_skip_av_seeds()
ytdlp_proxy_endpoints_from_file = get_ytdlp_proxy_endpoints()
worker = brozzler.worker.BrozzlerWorker(
frontier,
service_registry,
skip_av_seeds=skip_av_seeds_from_file,
ytdlp_proxy_endpoints=ytdlp_proxy_endpoints_from_file,
max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe,
proxy=args.proxy,
Expand Down
8 changes: 6 additions & 2 deletions brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
it runs yt-dlp on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier
Copyright (C) 2014-2024 Internet Archive
Copyright (C) 2014-2025 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -57,6 +57,7 @@ def __init__(
frontier,
service_registry=None,
skip_av_seeds=None,
ytdlp_proxy_endpoints=None,
max_browsers=1,
chrome_exe="chromium-browser",
warcprox_auto=False,
Expand All @@ -81,6 +82,7 @@ def __init__(
self._frontier = frontier
self._service_registry = service_registry
self._skip_av_seeds = skip_av_seeds
self._ytdlp_proxy_endpoints = ytdlp_proxy_endpoints
self._max_browsers = max_browsers

self._warcprox_auto = warcprox_auto
Expand Down Expand Up @@ -290,7 +292,9 @@ def brozzle_page(
site, page, status_code, self._skip_av_seeds
):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
ydl_outlinks = ydl.do_youtube_dl(
self, site, page, self._ytdlp_proxy_endpoints
)
metrics.brozzler_ydl_urls_checked.inc(1)
outlinks.update(ydl_outlinks)
except brozzler.ReachedLimit as e:
Expand Down
16 changes: 8 additions & 8 deletions brozzler/ydl.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
Copyright (C) 2024 Internet Archive
Copyright (C) 2024-2025 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -28,13 +28,13 @@
import doublethink
import datetime
from . import metrics
import random
import threading
import time

thread_local = threading.local()


YTDLP_PROXY = ""
PROXY_ATTEMPTS = 4
YTDLP_WAIT = 10
YTDLP_MAX_REDIRECTS = 5
Expand Down Expand Up @@ -91,7 +91,7 @@ def _http_request(self, req):
return req


def _build_youtube_dl(worker, destdir, site, page):
def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
"""
Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
Expand Down Expand Up @@ -308,11 +308,11 @@ def ydl_postprocess_hook(d):

ytdlp_url = page.redirect_url if page.redirect_url else page.url
is_youtube_host = isyoutubehost(ytdlp_url)
if is_youtube_host and YTDLP_PROXY:
ydl_opts["proxy"] = YTDLP_PROXY
if is_youtube_host and ytdlp_proxy_endpoints:
ydl_opts["proxy"] = random.choice(ytdlp_proxy_endpoints)
# don't log proxy value secrets
ytdlp_proxy_for_logs = (
YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@"
ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@"
)
logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs)

Expand Down Expand Up @@ -436,7 +436,7 @@ def _try_youtube_dl(worker, ydl, site, page):

@metrics.brozzler_ytdlp_duration_seconds.time()
@metrics.brozzler_in_progress_ytdlps.track_inprogress()
def do_youtube_dl(worker, site, page):
def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
"""
Runs yt-dlp configured for `worker` and `site` to download videos from
`page`.
Expand All @@ -453,7 +453,7 @@ def do_youtube_dl(worker, site, page):
prefix="brzl-ydl-", dir=worker._ytdlp_tmpdir
) as tempdir:
logging.info("tempdir for yt-dlp: %s", tempdir)
ydl = _build_youtube_dl(worker, tempdir, site, page)
ydl = _build_youtube_dl(worker, tempdir, site, page, ytdlp_proxy_endpoints)
ie_result = _try_youtube_dl(worker, ydl, site, page)
outlinks = set()
if ie_result and (
Expand Down

0 comments on commit 5e701e9

Please sign in to comment.