From 19ac66bdd8f776d686eab8bde14df9ae32707a2d Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sat, 6 Apr 2024 20:58:33 +0200 Subject: [PATCH] migrate: jupyterhub.jsonnet and .libsonnet --- dashboards/common.libsonnet | 11 + dashboards/jupyterhub.jsonnet | 1029 ++++++++++++++++--------------- dashboards/jupyterhub.libsonnet | 120 ++-- 3 files changed, 617 insertions(+), 543 deletions(-) diff --git a/dashboards/common.libsonnet b/dashboards/common.libsonnet index cbf49bd..970951b 100644 --- a/dashboards/common.libsonnet +++ b/dashboards/common.libsonnet @@ -2,6 +2,8 @@ local grafonnet = import 'grafonnet/main.libsonnet'; local ts = grafonnet.panel.timeSeries; local barChart = grafonnet.panel.barChart; local barGauge = grafonnet.panel.barGauge; +local heatmap = grafonnet.panel.heatmap; +local table = grafonnet.panel.table; local var = grafonnet.dashboard.variable; { @@ -32,6 +34,15 @@ local var = grafonnet.dashboard.variable; barGaugeOptions: barGauge.standardOptions.withMin(0), + // grafana ref: https://grafana.com/docs/grafana/v10.4/panels-visualizations/visualizations/heatmap/ + // grafonnet ref: https://grafana.github.io/grafonnet/API/panel/heatmap/index.html + heatmapOptions: + heatmap.standardOptions.withMin(0), + + tableOptions: + table.standardOptions.withMin(0), + + variables: { prometheus: var.datasource.new('PROMETHEUS_DS', 'prometheus'), diff --git a/dashboards/jupyterhub.jsonnet b/dashboards/jupyterhub.jsonnet index 62c4b38..1abcf52 100755 --- a/dashboards/jupyterhub.jsonnet +++ b/dashboards/jupyterhub.jsonnet @@ -3,532 +3,593 @@ // with useful stats about usage & diagnostics. local grafonnet = import 'grafonnet/main.libsonnet'; local dashboard = grafonnet.dashboard; -local singlestat = grafonnet.singlestat; -local graphPanel = grafonnet.graphPanel; -local prometheus = grafonnet.prometheus; -local tablePanel = grafonnet.tablePanel; -local row = grafonnet.row; -local heatmapPanel = grafonnet.heatmapPanel; +local ts = grafonnet.panel.timeSeries; +local prometheus = grafonnet.query.prometheus; +local table = grafonnet.panel.table; +local heatmap = grafonnet.panel.heatmap; +local row = grafonnet.panel.row; local common = import './common.libsonnet'; local jupyterhub = import 'jupyterhub.libsonnet'; local standardDims = jupyterhub.standardDims; // Hub usage stats -local currentActiveUsers = graphPanel.new( - 'Currently Active Users', - decimals=0, - stack=true, - min=0, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( +local currentActiveUsers = + common.tsOptions + + ts.new('Currently Active Users') + + ts.panelOptions.withDescription( ||| - sum( - group( - kube_pod_status_phase{phase="Running"} - ) by (label_component, pod, namespace) - %s - ) by (namespace) - ||| % jupyterhub.onComponentLabel('singleuser-server', group_right=''), - legendFormat='{{namespace}}', - ), -]); - -local dailyActiveUsers = graphPanel.new( - 'Daily Active Users', - description=||| - Number of unique users who were active within the preceeding 24h period. - - Requires JupyterHub 3.1. - |||, - legend_hideZero=false, - decimals=0, - stack=true, - min=0, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( + TODO ||| - max( - jupyterhub_active_users{period="24h", namespace=~"$hub"} - ) by (namespace) - |||, - legendFormat='{{namespace}}', - ), -]); - -local weeklyActiveUsers = graphPanel.new( - 'Weekly Active Users', - description=||| - Number of unique users who were active within the preceeding 7d period. - - Requires JupyterHub 3.1. 
- |||, - legend_hideZero=false, - decimals=0, - stack=true, - min=0, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( + ) + + ts.standardOptions.withDecimals(0) + // stack=true, + + ts.standardOptions.withMin(0) + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + group( + kube_pod_status_phase{phase="Running"} + ) by (label_component, pod, namespace) + %s + ) by (namespace) + ||| + % jupyterhub.onComponentLabel('singleuser-server', group_right=''), + ) + + prometheus.withLegendFormat('{{ namespace }}'), + ]); + +local dailyActiveUsers = + common.tsOptions + + ts.new('Daily Active Users') + + ts.panelOptions.withDescription( ||| - max( - jupyterhub_active_users{period="7d", namespace=~"$hub"} - ) by (namespace) + Number of unique users who were active within the preceding 24h period. + + Requires JupyterHub 3.1. |||, - legendFormat='{{namespace}}', - ), -]); - -local monthlyActiveUsers = graphPanel.new( - 'Monthly Active Users', - description=||| - Number of unique users who were active within the preceeding 7d period. - - Requires JupyterHub 3.1. - |||, - legend_hideZero=false, - decimals=0, - stack=true, - min=0, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( + ) + // legend_hideZero=false, + + ts.standardOptions.withDecimals(0) + // stack=true, + + ts.standardOptions.withMin(0) + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + max( + jupyterhub_active_users{period="24h", namespace=~"$hub"} + ) by (namespace) + ||| + ) + + prometheus.withLegendFormat('{{ namespace }}'), + ]); + +local weeklyActiveUsers = + common.tsOptions + + ts.new('Weekly Active Users') + + ts.panelOptions.withDescription( ||| - max( - jupyterhub_active_users{period="30d", namespace=~"$hub"} - ) by (namespace) - |||, - legendFormat='{{namespace}}', - ), -]); + Number of unique users who were active within the preceeding 7d period. -local userMemoryDistribution = heatmapPanel.new( - 'User memory usage distribution', - // xBucketSize and interval must match to get correct values out of heatmaps - xBucketSize='600s', - yAxis_format='bytes', - yAxis_min=0, - color_colorScheme='interpolateViridis', - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( - ||| - sum( - # exclude name="" because the same container can be reported - # with both no name and `name=k8s_...`, - # in which case sum() by (pod) reports double the actual metric - container_memory_working_set_bytes{name!=""} - %s - ) by (pod) - ||| % jupyterhub.onComponentLabel('singleuser-server', group_left='container'), - interval='600s', - intervalFactor=1, - ), -]); - -local userCPUDistribution = heatmapPanel.new( - 'User CPU usage distribution', - // xBucketSize and interval must match to get correct values out of heatmaps - xBucketSize='600s', - yAxis_format='percentunit', - yAxis_min=0, - color_colorScheme='interpolateViridis', - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( + Requires JupyterHub 3.1. 
||| - sum( - # exclude name="" because the same container can be reported - # with both no name and `name=k8s_...`, - # in which case sum() by (pod) reports double the actual metric - irate(container_cpu_usage_seconds_total{name!=""}[5m]) - %s - ) by (pod) - ||| % jupyterhub.onComponentLabel('singleuser-server', group_left='container'), - interval='600s', - intervalFactor=1, - ), -]); - -local userAgeDistribution = heatmapPanel.new( - 'User active age distribution', - // xBucketSize and interval must match to get correct values out of heatmaps - xBucketSize='600s', - yAxis_format='s', - yAxis_min=0, - color_colorScheme='interpolateViridis', - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( + ) + // legend_hideZero=false, + + ts.standardOptions.withDecimals(0) + // stack=true, + + ts.standardOptions.withMin(0) + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + max( + jupyterhub_active_users{period="7d", namespace=~"$hub"} + ) by (namespace) + ||| + ) + + prometheus.withLegendFormat('{{ namespace }}'), + ]); + +local monthlyActiveUsers = + common.tsOptions + + ts.new('Monthly Active Users') + + ts.panelOptions.withDescription( ||| - ( - time() - - ( - kube_pod_created - %s - ) - ) - ||| % jupyterhub.onComponentLabel('singleuser-server'), - interval='600s', - intervalFactor=1, - ), -]); + Number of unique users who were active within the preceeding 7d period. -// Hub diagnostics -local hubResponseLatency = graphPanel.new( - 'Hub response latency', - formatY1='s', - min=0, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( + Requires JupyterHub 3.1. ||| - histogram_quantile( - 0.99, + ) + // legend_hideZero=false, + + ts.standardOptions.withDecimals(0) + // stack=true, + + ts.standardOptions.withMin(0) + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + max( + jupyterhub_active_users{period="30d", namespace=~"$hub"} + ) by (namespace) + |||, + ) + + prometheus.withLegendFormat('{{ namespace }}'), + ]); + +local userMemoryDistribution = + common.heatmapOptions + + heatmap.new('User memory usage distribution') + // xBucketSize and interval must match to get correct values out of heatmaps + // xBucketSize='600s', + // yAxis_format='bytes', + // yAxis_min=0, + // color_colorScheme='interpolateViridis', + + heatmap.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| sum( - rate( - jupyterhub_request_duration_seconds_bucket{ - app="jupyterhub", - namespace=~"$hub", - # Ignore SpawnProgressAPIHandler, as it is a EventSource stream - # and keeps long lived connections open - handler!="jupyterhub.apihandlers.users.SpawnProgressAPIHandler" - }[5m] - ) - ) by (le)) - |||, - legendFormat='99th percentile' - ), - prometheus.target( - ||| - histogram_quantile( - 0.50, + # exclude name="" because the same container can be reported + # with both no name and `name=k8s_...`, + # in which case sum() by (pod) reports double the actual metric + container_memory_working_set_bytes{name!=""} + %s + ) by (pod) + ||| + % jupyterhub.onComponentLabel('singleuser-server', group_left='container'), + ), + // interval='600s', + // intervalFactor=1, + ]); + +local userCPUDistribution = + common.heatmapOptions + + heatmap.new('User CPU usage distribution') + // xBucketSize and interval must match to get correct values out of heatmaps + // xBucketSize='600s', + // yAxis_format='percentunit', + // yAxis_min=0, + // color_colorScheme='interpolateViridis', + + heatmap.queryOptions.withTargets([ + prometheus.new( + 
'$PROMETHEUS_DS', + ||| sum( - rate( - jupyterhub_request_duration_seconds_bucket{ - app="jupyterhub", - namespace=~"$hub", - # Ignore SpawnProgressAPIHandler, as it is a EventSource stream - # and keeps long lived connections open - handler!="jupyterhub.apihandlers.users.SpawnProgressAPIHandler" - }[5m] + # exclude name="" because the same container can be reported + # with both no name and `name=k8s_...`, + # in which case sum() by (pod) reports double the actual metric + irate(container_cpu_usage_seconds_total{name!=""}[5m]) + %s + ) by (pod) + ||| + % jupyterhub.onComponentLabel('singleuser-server', group_left='container'), + ), + // interval='600s', + // intervalFactor=1, + ]); + +local userAgeDistribution = + common.heatmapOptions + + heatmap.new('User active age distribution') + // xBucketSize and interval must match to get correct values out of heatmaps + // xBucketSize='600s', + // yAxis_format='s', + // yAxis_min=0, + // color_colorScheme='interpolateViridis', + + heatmap.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + ( + time() + - ( + kube_pod_created + %s ) - ) by (le)) - |||, - legendFormat='50th percentile' - ), - prometheus.target( - ||| - histogram_quantile( - 0.25, + ) + ||| + % jupyterhub.onComponentLabel('singleuser-server'), + ), + // interval='600s', + // intervalFactor=1, + ]); + +// Hub diagnostics +local hubResponseLatency = + common.tsOptions + + ts.new('Hub response latency') + // formatY1='s', + + ts.standardOptions.withMin(0) + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + histogram_quantile( + 0.99, + sum( + rate( + jupyterhub_request_duration_seconds_bucket{ + app="jupyterhub", + namespace=~"$hub", + # Ignore SpawnProgressAPIHandler, as it is a EventSource stream + # and keeps long lived connections open + handler!="jupyterhub.apihandlers.users.SpawnProgressAPIHandler" + }[5m] + ) + ) by (le)) + |||, + ) + + prometheus.withLegendFormat('99th percentile'), + prometheus.new( + '$PROMETHEUS_DS', + ||| + histogram_quantile( + 0.50, + sum( + rate( + jupyterhub_request_duration_seconds_bucket{ + app="jupyterhub", + namespace=~"$hub", + # Ignore SpawnProgressAPIHandler, as it is a EventSource stream + # and keeps long lived connections open + handler!="jupyterhub.apihandlers.users.SpawnProgressAPIHandler" + }[5m] + ) + ) by (le)) + |||, + ) + + prometheus.withLegendFormat('50th percentile'), + prometheus.new( + '$PROMETHEUS_DS', + ||| + histogram_quantile( + 0.25, + sum( + rate( + jupyterhub_request_duration_seconds_bucket{ + app="jupyterhub", + namespace=~"$hub", + # Ignore SpawnProgressAPIHandler, as it is a EventSource stream + # and keeps long lived connections open + handler!="jupyterhub.apihandlers.users.SpawnProgressAPIHandler" + }[5m] + ) + ) by (le)) + |||, + ) + + prometheus.withLegendFormat('25th percentile'), + ]); + +local hubResponseCodes = + common.tsOptions + + ts.new('Hub response status codes') + + ts.standardOptions.withMin(0) + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| sum( - rate( + increase( jupyterhub_request_duration_seconds_bucket{ app="jupyterhub", namespace=~"$hub", - # Ignore SpawnProgressAPIHandler, as it is a EventSource stream - # and keeps long lived connections open - handler!="jupyterhub.apihandlers.users.SpawnProgressAPIHandler" - }[5m] + }[2m] ) - ) by (le)) - |||, - legendFormat='25th percentile' - ), -]); - -local hubResponseCodes = graphPanel.new( - 'Hub response status codes', - min=0, - datasource='$PROMETHEUS_DS' -).addTargets([ - 
prometheus.target( - ||| - sum( - increase( - jupyterhub_request_duration_seconds_bucket{ - app="jupyterhub", - namespace=~"$hub", - }[2m] - ) - ) by (code) - |||, - legendFormat='{{ code }}' - ), -]); + ) by (code) + ||| + ) + + prometheus.withLegendFormat('{{ code }}'), + ]); // with multi=true, component='singleuser-server' means all components *except* singleuser-server local allComponentsMemory = jupyterhub.memoryPanel('All JupyterHub Components', component='singleuser-server', multi=true); local allComponentsCPU = jupyterhub.cpuPanel('All JupyterHub Components', component='singleuser-server', multi=true); -local hubDBUsage = graphPanel.new( - 'Hub DB Disk Space Availability %', - description=||| - % of disk space left in the disk storing the JupyterHub sqlite database. If goes to 0, the hub will fail. - |||, - decimals=0, - min=0, - max=1, - formatY1='percentunit', - datasource='$PROMETHEUS_DS' -).addTarget( - prometheus.target( +local hubDBUsage = + common.tsOptions + + ts.new('Hub DB Disk Space Availability %') + + ts.panelOptions.withDescription( ||| - # Free bytes available on the hub db PVC - sum(kubelet_volume_stats_available_bytes{persistentvolumeclaim="hub-db-dir", namespace=~"$hub"}) by (namespace) / - # Total number of bytes available on the hub db PVC - sum(kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="hub-db-dir", namespace=~"$hub"}) by (namespace) - |||, - legendFormat='{{ $hub }}' - ), -); - - -local serverStartTimes = graphPanel.new( - 'Server Start Times', - formatY1='s', - lines=false, - min=0, - points=true, - pointradius=2, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( - // Metrics from hub seems to have `namespace` rather than just `namespace` - 'histogram_quantile(0.99, sum(rate(jupyterhub_server_spawn_duration_seconds_bucket{app="jupyterhub", namespace=~"$hub"}[5m])) by (le))', - legendFormat='99th percentile' - ), - prometheus.target( - 'histogram_quantile(0.5, sum(rate(jupyterhub_server_spawn_duration_seconds_bucket{app="jupyterhub", namespace=~"$hub"}[5m])) by (le))', - legendFormat='50th percentile' - ), -]); - -local serverSpawnFailures = graphPanel.new( - 'Server Start Failures', - description=||| - Attempts by users to start servers that failed. - |||, - lines=false, - min=0, - points=false, - legend_hideZero=true, - bars=true, - pointradius=2, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( - 'sum(increase(jupyterhub_server_spawn_duration_seconds_count{status!="success"}[2m])) by (status)', - legendFormat='{{status}}' - ), -]); - -local usersPerNode = graphPanel.new( - 'Users per node', - decimals=0, - min=0, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( + % of disk space left in the disk storing the JupyterHub sqlite database. If goes to 0, the hub will fail. ||| - sum( - # kube_pod_info.node identifies the pod node, - # while kube_pod_labels.node is the metrics exporter's node - kube_pod_info{node!=""} - %s - ) by (node) - ||| % jupyterhub.onComponentLabel('singleuser-server', group_left=''), - legendFormat='{{ node }}' - ), -]); - - -local nonRunningPods = graphPanel.new( - 'Non Running Pods', - description=||| - Pods in a non-running state in the hub's namespace. 
- - Pods stuck in non-running states often indicate an error condition - |||, - decimalsY1=0, - min=0, - stack=true, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( + ) + + ts.standardOptions.withDecimals(0) + + ts.standardOptions.withMin(0) + + ts.standardOptions.withMax(1) + // formatY1='percentunit', + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + # Free bytes available on the hub db PVC + sum(kubelet_volume_stats_available_bytes{persistentvolumeclaim="hub-db-dir", namespace=~"$hub"}) by (namespace) / + # Total number of bytes available on the hub db PVC + sum(kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="hub-db-dir", namespace=~"$hub"}) by (namespace) + ||| + ) + + prometheus.withLegendFormat('{{ $hub }}'), + ]); + + +local serverStartTimes = + common.tsOptions + + ts.new('Server Start Times') + // formatY1='s', + // lines=false, + + ts.standardOptions.withMin(0) + // points=true, + // pointradius=2, + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + // Metrics from hub seems to have `namespace` rather than just `namespace` + 'histogram_quantile(0.99, sum(rate(jupyterhub_server_spawn_duration_seconds_bucket{app="jupyterhub", namespace=~"$hub"}[5m])) by (le))', + ) + + prometheus.withLegendFormat('99th percentile'), + prometheus.new( + '$PROMETHEUS_DS', + 'histogram_quantile(0.5, sum(rate(jupyterhub_server_spawn_duration_seconds_bucket{app="jupyterhub", namespace=~"$hub"}[5m])) by (le))', + ) + + prometheus.withLegendFormat('50th percentile'), + ]); + +local serverSpawnFailures = + common.tsOptions + + ts.new('Server Start Failures') + + ts.panelOptions.withDescription( ||| - sum( - kube_pod_status_phase{phase!="Running", namespace=~"$hub"} - ) by (phase) - |||, - legendFormat='{{phase}}' - ), -]); - -local sharedVolumeFreeSpace = graphPanel.new( - 'Free space (%) in shared volume (Home directories, etc.)', - description=||| - % of disk space left in a shared storage volume, typically used for users' - home directories. - - Requires an additional node_exporter deployment to work. If this graph - is empty, look at the README for jupyterhub/grafana-dashboards to see - what extra deployment is needed. - |||, - decimalsY1=0, - min=0, - max=1, - formatY1='percentunit', - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( + Attempts by users to start servers that failed. 
||| - min( - node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace=~"$hub"} - / - node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace=~"$hub"} - ) by (namespace) - |||, - legendFormat='{{namespace}}' - ), -]); + ) + // lines=false, + + ts.standardOptions.withMin(0) + // points=false, + // legend_hideZero=true, + // bars=true, + // pointradius=2, + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum(increase(jupyterhub_server_spawn_duration_seconds_count{status!="success"}[2m])) by (status) + ||| + ) + + prometheus.withLegendFormat('{{status}}'), + ]); + +local usersPerNode = + common.tsOptions + + ts.new('Users per node') + + ts.standardOptions.withDecimals(0) + + ts.standardOptions.withMin(0) + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + # kube_pod_info.node identifies the pod node, + # while kube_pod_labels.node is the metrics exporter's node + kube_pod_info{node!=""} + %s + ) by (node) + ||| + % jupyterhub.onComponentLabel('singleuser-server', group_left='') + ) + + prometheus.withLegendFormat('{{ node }}'), + ]); + + +local nonRunningPods = + common.tsOptions + + ts.new('Non Running Pods') + + ts.panelOptions.withDescription( + ||| + Pods in a non-running state in the hub's namespace. + + Pods stuck in non-running states often indicate an error condition + ||| + ) + // decimalsY1=0, + + ts.standardOptions.withMin(0) + // stack=true, + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum( + kube_pod_status_phase{phase!="Running", namespace=~"$hub"} + ) by (phase) + ||| + ) + + prometheus.withLegendFormat('{{phase}}'), + ]); + +local sharedVolumeFreeSpace = + common.tsOptions + + ts.new('Free space (%) in shared volume (Home directories, etc.)') + + ts.panelOptions.withDescription( + ||| + % of disk space left in a shared storage volume, typically used for users' + home directories. + + Requires an additional node_exporter deployment to work. If this graph + is empty, look at the README for jupyterhub/grafana-dashboards to see + what extra deployment is needed. + ||| + ) + // decimalsY1=0, + + ts.standardOptions.withMin(0) + + ts.standardOptions.withMax(1) + // formatY1='percentunit', + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + min( + node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace=~"$hub"} + / + node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace=~"$hub"} + ) by (namespace) + ||| + ) + + prometheus.withLegendFormat('{{ namespace }}'), + ]); // Anomalous tables -local oldUserpods = tablePanel.new( - 'Very old user pods', - description=||| - User pods that have been running for a long time (>8h). 
- - This often indicates problems with the idle culler - |||, - transform='timeseries_to_rows', - styles=[ - { - pattern: 'Value', - type: 'number', - unit: 's', - alias: 'Age', - }, - ], - sort={ - col: 2, - desc: true, - }, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( +local oldUserpods = + common.tableOptions + + table.new('Very old user pods') + + ts.panelOptions.withDescription( ||| - ( - time() - (kube_pod_created %s) - ) > (8 * 60 * 60) # 8 hours is our threshold - ||| % jupyterhub.onComponentLabel('singleuser-server'), - legendFormat='{{namespace}}/{{pod}}', - instant=true - ), -]).hideColumn('Time'); - -local highCPUUserPods = tablePanel.new( - 'User Pods with high CPU usage (>0.5)', - description=||| - User pods using a lot of CPU - - This could indicate a runaway process consuming resources - unnecessarily. - |||, - transform='timeseries_to_rows', - styles=[ - { - pattern: 'Value', - type: 'number', - unit: 'percentunit', - alias: 'CPU usage', - }, - ], - sort={ - col: 2, - desc: true, - }, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( + User pods that have been running for a long time (>8h). + + This often indicates problems with the idle culler + ||| + ) + // transform='timeseries_to_rows', + // styles=[ + // { + // pattern: 'Value', + // type: 'number', + // unit: 's', + // alias: 'Age', + // }, + // ], + // sort={ + // col: 2, + // desc: true, + // }, + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + ( + time() - (kube_pod_created %s) + ) > (8 * 60 * 60) # 8 hours is our threshold + ||| + % jupyterhub.onComponentLabel('singleuser-server') + ) + + prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'), + // instant=true + ]); +// .hideColumn('Time') + +local highCPUUserPods = + common.tableOptions + + table.new('User Pods with high CPU usage (>0.5)') + + ts.panelOptions.withDescription( ||| - max( # Ideally we just want 'current' value, so max will do - irate(container_cpu_usage_seconds_total[5m]) - %s - ) by (namespace, pod) > 0.5 - ||| % jupyterhub.onComponentLabel('singleuser-server', group_left=''), - legendFormat='{{namespace}}/{{pod}}', - instant=true - ), -]).hideColumn('Time'); - -local highMemoryUsagePods = tablePanel.new( - 'User pods with high memory usage (>80% of limit)', - description=||| - User pods getting close to their memory limit - - Once they hit their memory limit, user kernels will start dying. - |||, - transform='timeseries_to_rows', - styles=[ - { - pattern: 'Value', - type: 'number', - unit: 'percentunit', - alias: '% of mem limit consumed', - }, - ], - sort={ - col: 2, - desc: true, - }, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( + User pods using a lot of CPU + + This could indicate a runaway process consuming resources + unnecessarily. 
+ ||| + ) + // transform='timeseries_to_rows', + // styles=[ + // { + // pattern: 'Value', + // type: 'number', + // unit: 'percentunit', + // alias: 'CPU usage', + // }, + // ], + // sort={ + // col: 2, + // desc: true, + // }, + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + max( # Ideally we just want 'current' value, so max will do + irate(container_cpu_usage_seconds_total[5m]) + %s + ) by (namespace, pod) > 0.5 + ||| + % jupyterhub.onComponentLabel('singleuser-server', group_left='') + ) + + prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'), + // instant=true + ]); +// .hideColumn('Time') + +local highMemoryUsagePods = + common.tableOptions + + table.new('User pods with high memory usage (>80% of limit)') + + ts.panelOptions.withDescription( ||| - max( # Ideally we just want 'current', but max will do. This metric is a gauge, so sum is inappropriate - container_memory_working_set_bytes - %(selector)s - ) by (namespace, pod) - / - sum( - kube_pod_container_resource_limits_memory_bytes - %(selector)s - ) by (namespace, pod) - > 0.8 - ||| % { - selector: jupyterhub.onComponentLabel('singleuser-server', group_left=''), - }, - legendFormat='{{namespace}}/{{pod}}', - instant=true - ), -]).hideColumn('Time'); + User pods getting close to their memory limit + + Once they hit their memory limit, user kernels will start dying. + ||| + ) + // transform='timeseries_to_rows', + // styles=[ + // { + // pattern: 'Value', + // type: 'number', + // unit: 'percentunit', + // alias: '% of mem limit consumed', + // }, + // ], + // sort={ + // col: 2, + // desc: true, + // }, + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + max( # Ideally we just want 'current', but max will do. This metric is a gauge, so sum is inappropriate + container_memory_working_set_bytes + %(selector)s + ) by (namespace, pod) + / + sum( + kube_pod_container_resource_limits_memory_bytes + %(selector)s + ) by (namespace, pod) + > 0.8 + ||| + % { + selector: jupyterhub.onComponentLabel('singleuser-server', group_left=''), + } + ) + + prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'), + // instant=true + ]); +// .hideColumn('Time') // Show images used by different users on the hub -local notebookImagesUsed = graphPanel.new( - 'Images used by user pods', - description=||| - Number of user servers using a container image. - |||, - legend_hideZero=false, - decimals=0, - stack=false, - min=0, - datasource='$PROMETHEUS_DS' -).addTargets([ - prometheus.target( +local notebookImagesUsed = + common.tsOptions + + ts.new('Images used by user pods') + + ts.panelOptions.withDescription( ||| - sum ( - # User pods are named "notebook" by kubespawner - kube_pod_container_info{container="notebook", namespace=~"$hub"} - ) by(image_spec, namespace) - |||, - legendFormat='{{image_spec}} (prod)', - ), -]); + Number of user servers using a container image. 
+ ||| + ) + // legend_hideZero=false, + + ts.standardOptions.withDecimals(0) + // stack=false, + + ts.standardOptions.withMin(0) + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + ||| + sum ( + # User pods are named "notebook" by kubespawner + kube_pod_container_info{container="notebook", namespace=~"$hub"} + ) by(image_spec, namespace) + ||| + ) + + prometheus.withLegendFormat('{{ image_spec }} (prod)'), + ]); dashboard.new('JupyterHub Dashboard') + dashboard.withTags(['jupyterhub']) diff --git a/dashboards/jupyterhub.libsonnet b/dashboards/jupyterhub.libsonnet index 32ae1ba..6998ade 100644 --- a/dashboards/jupyterhub.libsonnet +++ b/dashboards/jupyterhub.libsonnet @@ -1,6 +1,6 @@ local grafonnet = import 'grafonnet/main.libsonnet'; -local graphPanel = grafonnet.graphPanel; -local prometheus = grafonnet.prometheus; +local ts = grafonnet.panel.timeSeries; +local prometheus = grafonnet.query.prometheus; { /* @@ -73,41 +73,42 @@ local prometheus = grafonnet.prometheus; * @param title The title of the graph panel. * @param metric The metric to be observed. * @param component The component to be measured (or excluded). Optional if `multi=true`, in which case it is an exclusion, otherwise required. - * @param formatY1 (optional) Passthrough `formatY1` to `graphPanel.new` - * @param decimalsY1 (optional) Passthrough `decimalsY1` to `graphPanel.new` + * @param formatY1 (optional) Passthrough `formatY1` to `ts.new` + * @param decimalsY1 (optional) Passthrough `decimalsY1` to `ts.new` * @param multi (default `false`) If true, do a multi-component chart instead of single-component. * The chart will have a legend table for each component. */ - componentResourcePanel(title, metric, component='', formatY1=null, decimalsY1=null, multi=false):: graphPanel.new( - title, - decimalsY1=decimalsY1, - formatY1=formatY1, + componentResourcePanel(title, metric, component='', formatY1=null, decimalsY1=null, multi=false):: + ts.new(title) + //decimalsY1=decimalsY1, + //formatY1=formatY1, // show legend as a table with current, avg, max values - legend_alignAsTable=true, - legend_current=true, - legend_avg=true, - legend_max=true, - legend_hideZero=true, + //legend_alignAsTable=true, + //legend_current=true, + //legend_avg=true, + //legend_max=true, + //legend_hideZero=true, // legend_values is required for any of the above to work - legend_values=true, - min=0, - ).addTargets([ - prometheus.target( - std.format( - ||| - sum( - %s - %s - ) by (label_component) - |||, - [ - metric, - self.onComponentLabel(component, cmp=if multi then '!=' else '=', group_left='container, label_component'), - ], - ), - legendFormat=if multi then '{{ label_component }}' else title, - ), - ],), + //legend_values=true, + //min=0, + + ts.queryOptions.withTargets([ + prometheus.new( + '$PROMETHEUS_DS', + std.format( + ||| + sum( + %s + %s + ) by (label_component) + |||, + [ + metric, + self.onComponentLabel(component, cmp=if multi then '!=' else '=', group_left='container, label_component'), + ], + ) + ) + + prometheus.withLegendFormat(if multi then '{{ label_component }}' else title), + ]), /** * Creates a memory (working set) graph panel for one (or more) JupyterHub component(s). @@ -119,18 +120,19 @@ local prometheus = grafonnet.prometheus; * @param multi (default `false`) If true, do a multi-component chart instead of single-component. * The chart will have a legend table for each component. 
*/ - memoryPanel(name, component, multi=false):: self.componentResourcePanel( - std.format('%s Memory (Working Set)', [name]), - component=component, - metric=||| - # exclude name="" because the same container can be reported - # with both no name and `name=k8s_...`, - # in which case sum() reports double the actual metric - container_memory_working_set_bytes{name!=""} - |||, - formatY1='bytes', - multi=multi, - ), + memoryPanel(name, component, multi=false):: + self.componentResourcePanel( + std.format('%s Memory (Working Set)', [name]), + component=component, + metric=||| + # exclude name="" because the same container can be reported + # with both no name and `name=k8s_...`, + # in which case sum() reports double the actual metric + container_memory_working_set_bytes{name!=""} + |||, + formatY1='bytes', + multi=multi, + ), /** * Creates a CPU usage graph panel for one (or more) JupyterHub component(s). @@ -142,19 +144,19 @@ local prometheus = grafonnet.prometheus; * @param multi (default `false`) If true, do a multi-component chart instead of single-component. * The chart will have a legend table for each component. */ - cpuPanel(name, component, multi=false):: self.componentResourcePanel( - std.format('%s CPU', [name]), - component=component, - metric=||| - # exclude name="" because the same container can be reported - # with both no name and `name=k8s_...`, - # in which case sum() reports double the actual metric - irate(container_cpu_usage_seconds_total{name!=""}[5m]) - |||, - // decimals=1 with percentunit means round to nearest 10% - decimalsY1=1, - formatY1='percentunit', - multi=multi, - ), - + cpuPanel(name, component, multi=false):: + self.componentResourcePanel( + std.format('%s CPU', [name]), + component=component, + metric=||| + # exclude name="" because the same container can be reported + # with both no name and `name=k8s_...`, + # in which case sum() reports double the actual metric + irate(container_cpu_usage_seconds_total{name!=""}[5m]) + |||, + // decimals=1 with percentunit means round to nearest 10% + decimalsY1=1, + formatY1='percentunit', + multi=multi, + ), }
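
// Usage sketch (illustrative): how the memoryPanel/cpuPanel helpers above are consumed
// from jupyterhub.jsonnet. These calls appear unchanged as context lines in the diff
// above; nothing here is new API.
//
//   local jupyterhub = import 'jupyterhub.libsonnet';
//
//   // with multi=true, component='singleuser-server' means all components *except* singleuser-server
//   local allComponentsMemory =
//     jupyterhub.memoryPanel('All JupyterHub Components', component='singleuser-server', multi=true);
//   local allComponentsCPU =
//     jupyterhub.cpuPanel('All JupyterHub Components', component='singleuser-server', multi=true);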