Skip to content

Commit

Permalink
feat(agent): metrics module WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
rezib committed Oct 9, 2024
1 parent 1bb742c commit c97d222
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 0 deletions.
6 changes: 6 additions & 0 deletions slurmweb/apps/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@
from rfl.web.tokens import RFLTokenizedRBACWebApp
from racksdb.errors import RacksDBSchemaError, RacksDBFormatError
from racksdb.web.app import RacksDBWebBlueprint
from werkzeug.middleware import dispatcher

from . import SlurmwebWebApp
from ..version import get_version
from ..views import SlurmwebAppRoute
from ..views import agent as views
from ..slurmrestd import SlurmrestdFilteredCached
from ..metrics import register_collector
from ..cache import CachingService

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -104,3 +106,7 @@ def __init__(self, seed):
# Default RacksDB infrastructure is the cluster name.
if self.settings.racksdb.infrastructure is None:
self.settings.racksdb.infrastructure = self.settings.service.cluster

self.wsgi_app = dispatcher.DispatcherMiddleware(
self.wsgi_app, {"/metrics": register_collector(self.slurmrestd)}
)
37 changes: 37 additions & 0 deletions slurmweb/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (c) 2024 Rackslab
#
# This file is part of Slurm-web.
#
# SPDX-License-Identifier: GPL-3.0-or-later

import prometheus_client

from prometheus_client import core


class SlurmWebCollector(prometheus_client.registry.Collector):
    """Prometheus collector publishing Slurm node and job counts per state."""

    def __init__(self, slurmrestd):
        # Object queried at collection time through its nodes_states() and
        # jobs_states() methods.
        self.slurmrestd = slurmrestd

    def collect(self):
        """Yield the slurm_nodes gauge family, then the slurm_jobs one.

        The states of nodes are requested before the first family is yielded;
        the states of jobs are requested only when the consumer asks for the
        second family.
        """
        nodes_counts = self.slurmrestd.nodes_states()
        yield self._gauge("slurm_nodes", "Slurm nodes", nodes_counts)

        jobs_counts = self.slurmrestd.jobs_states()
        yield self._gauge("slurm_jobs", "Slurm jobs", jobs_counts)

    @staticmethod
    def _gauge(name, documentation, counts):
        """Build a gauge family with one sample per key of counts.

        Each key of the counts mapping becomes the value of the "state" label
        and the associated value becomes the sample value.
        """
        family = core.GaugeMetricFamily(name, documentation, labels=["state"])
        for state, count in counts.items():
            family.add_metric([state], count)
        return family


def register_collector(slurmrestd):
    """Return a WSGI application exposing Slurm-web metrics for Prometheus.

    The collector is attached to a dedicated registry instead of the
    process-wide default prometheus_client.REGISTRY. The default registry is
    therefore left untouched: this function can be called several times in the
    same process (for instance when several applications are instantiated)
    without failing on an already registered collector or on default
    collectors that have already been unregistered. As the GC, platform and
    process collectors are never registered on the dedicated registry, only
    the metrics produced by SlurmWebCollector are exposed.

    The slurmrestd argument is handed to SlurmWebCollector, which calls its
    nodes_states() and jobs_states() methods when metrics are collected.
    """
    registry = prometheus_client.CollectorRegistry()
    registry.register(SlurmWebCollector(slurmrestd))
    return prometheus_client.make_wsgi_app(registry=registry)

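# Example query of the collected metric through the Prometheus HTTP API
# (assumes a Prometheus server listening on localhost:9090):
# curl --data-urlencode 'query=slurm_nodes{job="slurm"}[30m]' http://localhost:9090/api/v1/query | jq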
52 changes: 52 additions & 0 deletions slurmweb/slurmrestd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,32 @@ def version(self, **kwargs):
def jobs(self, **kwargs):
    """Request the jobs endpoint of slurmrestd and return the "jobs" result.

    All keyword arguments are forwarded unchanged to self._request().
    """
    endpoint = f"/slurm/v{self.api_version}/jobs"
    return self._request(endpoint, "jobs", **kwargs)

def jobs_states(self):
    """Count the jobs returned by self.jobs() per state.

    Returns a dict with the keys "running", "completed", "completing",
    "cancelled", "pending", "unknown" and "total", in this order. Every job
    increments "total" and exactly one of the other counters.

    The "job_state" value of a job may be either a single string or a
    collection (list, tuple, set) of state flags. A collection is matched
    against the tracked states in the order listed above and the first match
    wins. Any other value, or a value without tracked state, is counted as
    "unknown".
    """
    tracked = ("running", "completed", "completing", "cancelled", "pending")
    result = dict.fromkeys((*tracked, "unknown", "total"), 0)
    for job in self.jobs():
        state = job["job_state"]
        # Normalize to a collection of flags so that a plain string keeps
        # being compared by strict equality, as a one-element collection.
        if isinstance(state, (list, tuple, set, frozenset)):
            flags = state
        else:
            flags = [state]
        for name in tracked:
            if name.upper() in flags:
                result[name] += 1
                break
        else:
            # No tracked state found for this job.
            result["unknown"] += 1
        result["total"] += 1
    return result

def _ctldjob(self, job_id: int, **kwargs):
return self._request(
f"/slurm/v{self.api_version}/job/{job_id}", "jobs", **kwargs
Expand All @@ -108,6 +134,32 @@ def _acctjob(self, job_id: int, **kwargs):
def nodes(self, **kwargs):
    """Request the nodes endpoint of slurmrestd and return the "nodes" result.

    All keyword arguments are forwarded unchanged to self._request().
    """
    endpoint = f"/slurm/v{self.api_version}/nodes"
    return self._request(endpoint, "nodes", **kwargs)

def nodes_states(self):
    """Count the nodes returned by self.nodes() per state.

    Returns a dict with the keys "idle", "mixed", "allocated", "down",
    "drain", "unknown" and "total", in this order. Every node increments
    "total" and exactly one of the other counters.

    The "state" value of each node is tested with the "in" operator against
    MIXED, ALLOCATED, DOWN, DRAIN and IDLE, in this order of precedence. The
    first match selects the counter; without match the node is counted as
    "unknown".
    """
    counters = {
        "idle": 0,
        "mixed": 0,
        "allocated": 0,
        "down": 0,
        "drain": 0,
        "unknown": 0,
        "total": 0,
    }
    # Pairs of (flag searched in node state, counter to increment), ordered
    # by precedence.
    precedence = (
        ("MIXED", "mixed"),
        ("ALLOCATED", "allocated"),
        ("DOWN", "down"),
        ("DRAIN", "drain"),
        ("IDLE", "idle"),
    )
    for node in self.nodes():
        bucket = "unknown"
        for flag, counter in precedence:
            if flag in node["state"]:
                bucket = counter
                break
        counters[bucket] += 1
        counters["total"] += 1
    return counters

def node(self, node_name: str, **kwargs):
try:
return self._request(
Expand Down

0 comments on commit c97d222

Please sign in to comment.