From 0c1aea0a2c9d141e68d9f9361d27520ad3964227 Mon Sep 17 00:00:00 2001 From: "Karl W. Schulz" Date: Thu, 12 Dec 2024 10:48:11 -0600 Subject: [PATCH] wip debug Signed-off-by: Karl W. Schulz --- test/test_job_user_push.py | 98 ++++++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 36 deletions(-) diff --git a/test/test_job_user_push.py b/test/test_job_user_push.py index a3d0bf9e..19e02126 100644 --- a/test/test_job_user_push.py +++ b/test/test_job_user_push.py @@ -49,11 +49,6 @@ $OMNISTAT_DIR/omnistat-usermode --start --interval 0.5 $cmd -ps uxw -curl localhost:8428/api/v1/query?query=rmsjob_info -curl localhost:8001/metrics -ls -ltr /tmp/ -cat /tmp/omni* $OMNISTAT_DIR/omnistat-usermode --stop """ @@ -65,39 +60,54 @@ def test_job_multinode(self): job_seconds = 2 jobid = self.run_job(config.nodes, job_seconds) +# return +# time.sleep(2) self.start_victoria_proxy() - time.sleep(2) - - cmd = ["docker", "exec", "slurm-controller", "ps", "ux"] - p = subprocess.run(cmd) - print(p.stdout) - - cmd = ["docker", "exec", "slurm-controller", "curl", "localhost:9090/api/v1/query?query=rmsjob_info"] - p = subprocess.run(cmd) - print(p.stdout) - - cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/victoria_server.log"] - p = subprocess.run(cmd) - print(p.stdout) - - cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/exporter.log"] - p = subprocess.run(cmd) - print(p.stdout) - - cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/slurm-1.out"] - p = subprocess.run(cmd) - print(p.stdout) - - cmd = ["docker", "exec", "slurm-controller", "ls", "-l","/jobs"] - p = subprocess.run(cmd) - print(p.stdout) - - cmd = ["docker", "exec", "slurm-controller", "find", "/jobs/vic-data"] - p = subprocess.run(cmd) - print(p.stdout) +## cmd = ["docker", "exec", "slurm-controller", "ps", "ux"] +## p = subprocess.run(cmd) +## print(p.stdout) +## +## cmd = ["docker", "exec", "slurm-controller", "curl", 'localhost:9090/api/v1/query?query=rmsjob_info'] +## p = subprocess.run(cmd) +## print(p.stdout) + +##### cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/victoria_server.log"] +##### p = subprocess.run(cmd) +##### print(p.stdout) +##### +##### cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/exporter.log"] +##### p = subprocess.run(cmd) +##### print(p.stdout) +##### +##### cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/slurm-1.out"] +##### p = subprocess.run(cmd) +##### print(p.stdout) +##### +##### cmd = ["docker", "exec", "slurm-controller", "ls", "-l","/jobs"] +##### p = subprocess.run(cmd) +##### print(p.stdout) +# return + +## cmd = ["docker", "exec", "slurm-controller", "find", "/jobs/vic-data"] +## p = subprocess.run(cmd) +## print(p.stdout) + +# return + + # verify we see rmsjob_info metric; note that VM can take some time on first startup before it successfully reports + # data via api/v1 queries. Hence, we try multiple times here before giving up. + prometheus = PrometheusConnect(url=config.victoria_url) - results = prometheus.custom_query("rmsjob_info") + wait_interval = 2.0 + for i in range(15): + results = prometheus.custom_query("rmsjob_info") + if len(results) == 0: + print("sleeping to wait for VM to deliver...") + time.sleep(wait_interval) + else: + break + assert len(results) >= 1, "Metric rmsjob_info not available" query = f"rmsjob_info{{jobid='{jobid}'}}[{config.time_range}]" @@ -139,12 +149,28 @@ def start_victoria_proxy(self, data_path=config.victoria_data_user): "-d", "slurm-controller", "victoria-metrics", +# "/tmp/victoria-metrics-prod", "-httpListenAddr=:9090", +# "-search.disableCache", +# "-search.maxStalenessInterval=15m", +# "-search.resetRollupResultCacheOnStartup", f"--storageDataPath={data_path}", ] p = subprocess.run(start_cmd) assert p.returncode == 0 + time.sleep(2) +# cmd = ["curl", "localhost:9090/internal/force_flush"] +# p = subprocess.run(cmd) +# cmd = ["curl", "localhost:9090/internal/force_merge"] +# p = subprocess.run(cmd) + cmd = ["curl", "localhost:9090/internal/resetRollupResultCache"] + p = subprocess.run(cmd) +# cmd = ["curl", 'localhost:9090/api/v1/query?query=rmsjob_info'] +# p = subprocess.run(cmd) + +# time.sleep(5) + def stop_victoria_proxy(self, ignore_errors=False): stop_cmd = ["docker", "exec", "slurm-controller", "pkill", "-f", "-SIGTERM", "victoria-metrics"] p = subprocess.run(stop_cmd) @@ -184,5 +210,5 @@ def run_job(self, nodes, seconds): for pattern in patterns: assert re.search(pattern, p.stdout) != None, f"Missing expected pattern\n{p.stdout}" - self.remove_job_file() +# self.remove_job_file() return jobid