Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update prometheus metrics #293

Merged
merged 4 commits on
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions brozzler/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,18 @@
from prometheus_client import Counter, Gauge, Histogram, start_http_server

# fmt: off
brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler")
brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler")
brozzler_page_processing_duration_seconds = Histogram("brozzler_page_processing_duration_seconds", "time spent processing a page in brozzler")
brozzler_in_progress_headers = Gauge("brozzler_in_progress_headers", "number of headers currently processing with brozzler")
brozzler_header_processing_duration_seconds = Histogram("brozzler_header_processing_duration_seconds", "time spent processing one page's headers in brozzler")
brozzler_in_progress_browses = Gauge("brozzler_in_progress_browses", "number of pages currently browsing with brozzler")
brozzler_browsing_duration_seconds = Histogram("brozzler_browsing_duration_seconds", "time spent browsing a page in brozzler")
brozzler_in_progress_ytdlps = Gauge("brozzler_in_progress_ytdlps", "number of ytdlp sessions currently in progress with brozzler")
brozzler_ytdlp_duration_seconds = Histogram("brozzler_ytdlp_duration_seconds", "time spent running ytdlp for a page in brozzler")
brozzler_pages_crawled = Counter("brozzler_pages_crawled", "number of pages visited by brozzler")
brozzler_outlinks_found = Counter("brozzler_outlinks_found", "number of outlinks found by brozzler")
brozzler_last_page_crawled_time = Gauge("brozzler_last_page_crawled_time", "time of last page visit, in seconds since UNIX epoch")
brozzler_in_progress_pages = Gauge("brozzler_in_progress_pages", "number of pages currently processing with brozzler")
brozzler_ydl_urls_checked = Counter("brozzler_ydl_urls_checked", "count of urls checked by brozzler yt-dlp")
brozzler_ydl_extract_attempts = Counter("brozzler_ydl_extract_attempts", "count of extracts attempted by brozzler yt-dlp", labelnames=["youtube_host"])
brozzler_ydl_extract_successes = Counter("brozzler_ydl_extract_successes", "count of extracts completed by brozzler yt-dlp", labelnames=["youtube_host"])
brozzler_ydl_download_successes = Counter("brozzler_ydl_download_successes", "count of downloads completed by brozzler yt-dlp", labelnames=["youtube_host"])
# fmt: on
Expand Down
8 changes: 6 additions & 2 deletions brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ def thumb_jpeg(self, full_jpeg):
img.save(out, "jpeg", quality=95)
return out.getbuffer()

@metrics.brozzler_page_processing_duration_seconds.time()
@metrics.brozzler_in_progress_pages.track_inprogress()
def brozzle_page(
self,
browser,
Expand Down Expand Up @@ -315,6 +317,8 @@ def brozzle_page(
)
return outlinks

@metrics.brozzler_header_processing_duration_seconds.time()
@metrics.brozzler_in_progress_headers.track_inprogress()
def _get_page_headers(self, page):
# bypassing warcprox, requests' stream=True defers downloading the body of the response
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
Expand All @@ -334,8 +338,8 @@ def _needs_browsing(self, page_headers):
return False
return True

-@metrics.brozzler_page_processing_duration_seconds.time()
-@metrics.brozzler_in_progress_pages.track_inprogress()
+@metrics.brozzler_browsing_duration_seconds.time()
+@metrics.brozzler_in_progress_browses.track_inprogress()
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def update_page_metrics(page, outlinks):
"""Update page-level Prometheus metrics."""
Expand Down
3 changes: 2 additions & 1 deletion brozzler/ydl.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,6 @@ def _try_youtube_dl(worker, ydl, site, page):
# ydl.extract_info(str(urlcanon.whatwg(ydl.url)), download=should_download_vid)
# if ydl.is_youtube_host and ie_result:
# download_url = ie_result.get("url")
-metrics.brozzler_ydl_extract_attempts.labels(ydl.is_youtube_host).inc(1)
with brozzler.thread_accept_exceptions():
# we do whatwg canonicalization here to avoid "<urlopen error
# no host given>" resulting in ProxyError
Expand Down Expand Up @@ -406,6 +405,8 @@ def _try_youtube_dl(worker, ydl, site, page):
return ie_result


@metrics.brozzler_ytdlp_duration_seconds.time()
@metrics.brozzler_in_progress_ytdlps.track_inprogress()
def do_youtube_dl(worker, site, page):
"""
Runs yt-dlp configured for `worker` and `site` to download videos from
Expand Down