From c97d222008e2ad7dc7e8d619899d61ba69544b46 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Palancher?=
Date: Wed, 2 Oct 2024 10:03:52 +0200
Subject: [PATCH] feat(agent): metrics module WIP

---
 slurmweb/apps/agent.py          |  7 +++++
 slurmweb/metrics.py             | 54 +++++++++++++++++++++++++++++++++
 slurmweb/slurmrestd/__init__.py | 43 ++++++++++++++++++++++++++
 3 files changed, 104 insertions(+)
 create mode 100644 slurmweb/metrics.py

diff --git a/slurmweb/apps/agent.py b/slurmweb/apps/agent.py
index 859b3da4..cbdb4a62 100644
--- a/slurmweb/apps/agent.py
+++ b/slurmweb/apps/agent.py
@@ -10,12 +10,14 @@ from rfl.web.tokens import RFLTokenizedRBACWebApp
 
 from racksdb.errors import RacksDBSchemaError, RacksDBFormatError
 from racksdb.web.app import RacksDBWebBlueprint
+from werkzeug.middleware import dispatcher
 
 from . import SlurmwebWebApp
 from ..version import get_version
 from ..views import SlurmwebAppRoute
 from ..views import agent as views
 from ..slurmrestd import SlurmrestdFilteredCached
+from ..metrics import register_collector
 from ..cache import CachingService
 
 logger = logging.getLogger(__name__)
@@ -104,3 +106,8 @@ def __init__(self, seed):
         # Default RacksDB infrastructure is the cluster name.
         if self.settings.racksdb.infrastructure is None:
             self.settings.racksdb.infrastructure = self.settings.service.cluster
+
+        # Mount the metrics WSGI application on /metrics beside the main one.
+        self.wsgi_app = dispatcher.DispatcherMiddleware(
+            self.wsgi_app, {"/metrics": register_collector(self.slurmrestd)}
+        )
diff --git a/slurmweb/metrics.py b/slurmweb/metrics.py
new file mode 100644
index 00000000..21802ba5
--- /dev/null
+++ b/slurmweb/metrics.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2024 Rackslab
+#
+# This file is part of Slurm-web.
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import prometheus_client
+
+from prometheus_client import core
+
+
+class SlurmWebCollector(prometheus_client.registry.Collector):
+    """Prometheus collector of Slurm nodes and jobs counts per state."""
+
+    def __init__(self, slurmrestd):
+        self.slurmrestd = slurmrestd
+
+    def collect(self):
+        """Yield slurm_nodes and slurm_jobs gauges, one sample per state.
+
+        Counts are requested from slurmrestd each time this method runs.
+        """
+        yield self._gauge("slurm_nodes", "Slurm nodes", self.slurmrestd.nodes_states())
+        yield self._gauge("slurm_jobs", "Slurm jobs", self.slurmrestd.jobs_states())
+
+    @staticmethod
+    def _gauge(name, documentation, states):
+        """Return a gauge family with one sample per key of states dict."""
+        family = core.GaugeMetricFamily(name, documentation, labels=["state"])
+        for state, value in states.items():
+            family.add_metric([state], value)
+        return family
+
+
+def register_collector(slurmrestd):
+    """Return a WSGI application that serves Slurm metrics.
+
+    The collector is registered in a dedicated registry instead of
+    prometheus_client.REGISTRY. The default registry is created with
+    auto_describe enabled, so registering a collector without describe()
+    calls its collect() method immediately, which would query slurmrestd as
+    soon as the agent starts. A dedicated registry also leaves out the
+    default process, platform and GC metrics without having to unregister
+    them, and this function can safely be called more than once.
+    """
+    registry = prometheus_client.CollectorRegistry()
+    registry.register(SlurmWebCollector(slurmrestd))
+    return prometheus_client.make_wsgi_app(registry)
+
+
+# Example query to check the values scraped by Prometheus:
+#
+#   curl --data-urlencode 'query=slurm_nodes{job="slurm"}[30m]' \
+#     http://localhost:9090/api/v1/query | jq
diff --git a/slurmweb/slurmrestd/__init__.py b/slurmweb/slurmrestd/__init__.py
index 1801149e..cb5e8e3e 100644
--- a/slurmweb/slurmrestd/__init__.py
+++ b/slurmweb/slurmrestd/__init__.py
@@ -95,6 +95,26 @@ def version(self, **kwargs):
     def jobs(self, **kwargs):
         return self._request(f"/slurm/v{self.api_version}/jobs", "jobs", **kwargs)
 
+    def jobs_states(self):
+        """Return the number of jobs per state, plus the total of all jobs.
+
+        Jobs in any other state than the reported ones count as unknown.
+        """
+        states = ("RUNNING", "COMPLETED", "COMPLETING", "CANCELLED", "PENDING")
+        result = {state.lower(): 0 for state in states}
+        result.update(unknown=0, total=0)
+        for job in self.jobs():
+            # job_state may be a single string or a list of state flags,
+            # depending on slurmrestd API version. Strings are wrapped to test
+            # both the same way, the first match in the order above wins.
+            flags = job["job_state"]
+            if isinstance(flags, str):
+                flags = [flags]
+            key = next((s.lower() for s in states if s in flags), "unknown")
+            result[key] += 1
+            result["total"] += 1
+        return result
+
     def _ctldjob(self, job_id: int, **kwargs):
         return self._request(
             f"/slurm/v{self.api_version}/job/{job_id}", "jobs", **kwargs
@@ -108,6 +128,29 @@ def _acctjob(self, job_id: int, **kwargs):
     def nodes(self, **kwargs):
         return self._request(f"/slurm/v{self.api_version}/nodes", "nodes", **kwargs)
 
+    def nodes_states(self):
+        """Return the number of nodes per state, plus the total of all nodes.
+
+        Nodes in any other state than the reported ones count as unknown.
+        """
+        # A node is counted once, in the first of these states found in its
+        # state value.
+        states = ("MIXED", "ALLOCATED", "DOWN", "DRAIN", "IDLE")
+        result = {
+            "idle": 0,
+            "mixed": 0,
+            "allocated": 0,
+            "down": 0,
+            "drain": 0,
+            "unknown": 0,
+            "total": 0,
+        }
+        for node in self.nodes():
+            key = next((s.lower() for s in states if s in node["state"]), "unknown")
+            result[key] += 1
+            result["total"] += 1
+        return result
+
     def node(self, node_name: str, **kwargs):
         try:
             return self._request(