From 9508c5b671511f8d64d42f93ea6b70ebee644995 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 4 Nov 2024 14:32:22 -0800 Subject: [PATCH 1/3] Add a traitlet to disable recording HTTP request metrics Since this records a series of metrics for each HTTP handler class, this quickly leads to an explosion of cardinality and makes storing metrics quite difficult. For example, just accessing the metrics endpoint creates the following 17 metrics: ``` http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="0.005",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="0.01",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="0.025",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="0.05",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="0.075",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="0.1",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="0.25",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="0.5",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="0.75",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="1.0",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="2.5",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="5.0",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="7.5",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="10.0",method="GET",status_code="200"} 9.0 http_request_duration_seconds_bucket{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",le="+Inf",method="GET",status_code="200"} 9.0 http_request_duration_seconds_count{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",method="GET",status_code="200"} 9.0 http_request_duration_seconds_sum{handler="jupyter_server.base.handlers.PrometheusMetricsHandler",method="GET",status_code="200"} 0.009019851684570312 ``` This has what has stalled prior attempts at collecting metrics from jupyter_server usefully in multitenant deployments (see https://github.com/berkeley-dsep-infra/datahub/pull/1977). This PR adds a traitlet that allows hub admins to turn these metrics off. --- jupyter_server/log.py | 8 ++++++-- jupyter_server/serverapp.py | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/jupyter_server/log.py b/jupyter_server/log.py index aed024bb32..b1211114e9 100644 --- a/jupyter_server/log.py +++ b/jupyter_server/log.py @@ -41,13 +41,16 @@ def _scrub_uri(uri: str) -> str: return uri -def log_request(handler): +def log_request(handler, record_prometheus_metrics=True): """log a bit more information about each request than tornado's default - move static file get success to debug-level (reduces noise) - get proxied IP instead of proxy IP - log referer for redirect and failed requests - log user-agent for failed requests + + if record_prometheus_metrics is true, will record a histogram prometheus + metric (http_request_duration_seconds) for each request handler """ status = handler.get_status() request = handler.request @@ -97,4 +100,5 @@ def log_request(handler): headers[header] = request.headers[header] log_method(json.dumps(headers, indent=2)) log_method(msg.format(**ns)) - prometheus_log_method(handler) + if record_prometheus_metrics: + prometheus_log_method(handler) diff --git a/jupyter_server/serverapp.py b/jupyter_server/serverapp.py index 2766509abc..2008b96800 100644 --- a/jupyter_server/serverapp.py +++ b/jupyter_server/serverapp.py @@ -4,6 +4,7 @@ # Distributed under the terms of the Modified BSD License. from __future__ import annotations +from functools import partial import datetime import errno import gettext @@ -410,7 +411,7 @@ def init_settings( settings = { # basics - "log_function": log_request, + "log_function": partial(log_request, record_prometheus_metrics=jupyter_app.record_http_request_metrics), "base_url": base_url, "default_url": default_url, "template_path": template_path, @@ -1993,6 +1994,18 @@ def _default_terminals_enabled(self) -> bool: config=True, ) + record_http_request_metrics = Bool( + True, + help=""" + REcord http_request_duration_seconds metric in the metrics endpoint. + + Since a histogram is exposed for each request handler, this can create a + *lot* of metrics, creating operational challenges for multitenant deployments. + + Set to False to disable recording the http_request_duration_seconds metric. + """ + ) + static_immutable_cache = List( Unicode(), help=""" From ba416b9686b0289c949ac63eeaf34465bfe9f17c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 22:37:27 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- jupyter_server/serverapp.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/jupyter_server/serverapp.py b/jupyter_server/serverapp.py index 2008b96800..ce9a82cc26 100644 --- a/jupyter_server/serverapp.py +++ b/jupyter_server/serverapp.py @@ -4,7 +4,6 @@ # Distributed under the terms of the Modified BSD License. from __future__ import annotations -from functools import partial import datetime import errno import gettext @@ -29,6 +28,7 @@ import urllib import warnings from base64 import encodebytes +from functools import partial from pathlib import Path import jupyter_client @@ -411,7 +411,9 @@ def init_settings( settings = { # basics - "log_function": partial(log_request, record_prometheus_metrics=jupyter_app.record_http_request_metrics), + "log_function": partial( + log_request, record_prometheus_metrics=jupyter_app.record_http_request_metrics + ), "base_url": base_url, "default_url": default_url, "template_path": template_path, @@ -2003,7 +2005,7 @@ def _default_terminals_enabled(self) -> bool: *lot* of metrics, creating operational challenges for multitenant deployments. Set to False to disable recording the http_request_duration_seconds metric. - """ + """, ) static_immutable_cache = List( From a4cf5796a9f6093b7618a5b2badd75938958580f Mon Sep 17 00:00:00 2001 From: Yuvi Panda Date: Mon, 4 Nov 2024 14:42:30 -0800 Subject: [PATCH 3/3] Fix typo Co-authored-by: Zachary Sailer --- jupyter_server/serverapp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jupyter_server/serverapp.py b/jupyter_server/serverapp.py index ce9a82cc26..8dd9d02515 100644 --- a/jupyter_server/serverapp.py +++ b/jupyter_server/serverapp.py @@ -1999,7 +1999,7 @@ def _default_terminals_enabled(self) -> bool: record_http_request_metrics = Bool( True, help=""" - REcord http_request_duration_seconds metric in the metrics endpoint. + Record http_request_duration_seconds metric in the metrics endpoint. Since a histogram is exposed for each request handler, this can create a *lot* of metrics, creating operational challenges for multitenant deployments.