Skip to content

Commit

Permalink
wip debug
Browse files Browse the repository at this point in the history
Signed-off-by: Karl W. Schulz <[email protected]>
  • Loading branch information
koomie committed Dec 12, 2024
1 parent 3337753 commit 0c1aea0
Showing 1 changed file with 62 additions and 36 deletions.
98 changes: 62 additions & 36 deletions test/test_job_user_push.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,6 @@
$OMNISTAT_DIR/omnistat-usermode --start --interval 0.5
$cmd
ps uxw
curl localhost:8428/api/v1/query?query=rmsjob_info
curl localhost:8001/metrics
ls -ltr /tmp/
cat /tmp/omni*
$OMNISTAT_DIR/omnistat-usermode --stop
"""

Expand All @@ -65,39 +60,54 @@ def test_job_multinode(self):
job_seconds = 2
jobid = self.run_job(config.nodes, job_seconds)

# return
# time.sleep(2)
self.start_victoria_proxy()
time.sleep(2)

cmd = ["docker", "exec", "slurm-controller", "ps", "ux"]
p = subprocess.run(cmd)
print(p.stdout)

cmd = ["docker", "exec", "slurm-controller", "curl", "localhost:9090/api/v1/query?query=rmsjob_info"]
p = subprocess.run(cmd)
print(p.stdout)

cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/victoria_server.log"]
p = subprocess.run(cmd)
print(p.stdout)

cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/exporter.log"]
p = subprocess.run(cmd)
print(p.stdout)

cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/slurm-1.out"]
p = subprocess.run(cmd)
print(p.stdout)

cmd = ["docker", "exec", "slurm-controller", "ls", "-l","/jobs"]
p = subprocess.run(cmd)
print(p.stdout)

cmd = ["docker", "exec", "slurm-controller", "find", "/jobs/vic-data"]
p = subprocess.run(cmd)
print(p.stdout)

## cmd = ["docker", "exec", "slurm-controller", "ps", "ux"]
## p = subprocess.run(cmd)
## print(p.stdout)
##
## cmd = ["docker", "exec", "slurm-controller", "curl", 'localhost:9090/api/v1/query?query=rmsjob_info']
## p = subprocess.run(cmd)
## print(p.stdout)

##### cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/victoria_server.log"]
##### p = subprocess.run(cmd)
##### print(p.stdout)
#####
##### cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/exporter.log"]
##### p = subprocess.run(cmd)
##### print(p.stdout)
#####
##### cmd = ["docker", "exec", "slurm-controller", "cat", "/jobs/slurm-1.out"]
##### p = subprocess.run(cmd)
##### print(p.stdout)
#####
##### cmd = ["docker", "exec", "slurm-controller", "ls", "-l","/jobs"]
##### p = subprocess.run(cmd)
##### print(p.stdout)
# return

## cmd = ["docker", "exec", "slurm-controller", "find", "/jobs/vic-data"]
## p = subprocess.run(cmd)
## print(p.stdout)

# return

# Verify that the rmsjob_info metric is visible. Note that VictoriaMetrics (VM) can take some time after its first startup
# before it successfully returns data via api/v1 queries, so we retry the query several times here before giving up.

prometheus = PrometheusConnect(url=config.victoria_url)
results = prometheus.custom_query("rmsjob_info")
wait_interval = 2.0
for i in range(15):
results = prometheus.custom_query("rmsjob_info")
if len(results) == 0:
print("sleeping to wait for VM to deliver...")
time.sleep(wait_interval)
else:
break

assert len(results) >= 1, "Metric rmsjob_info not available"

query = f"rmsjob_info{{jobid='{jobid}'}}[{config.time_range}]"
Expand Down Expand Up @@ -139,12 +149,28 @@ def start_victoria_proxy(self, data_path=config.victoria_data_user):
"-d",
"slurm-controller",
"victoria-metrics",
# "/tmp/victoria-metrics-prod",
"-httpListenAddr=:9090",
# "-search.disableCache",
# "-search.maxStalenessInterval=15m",
# "-search.resetRollupResultCacheOnStartup",
f"--storageDataPath={data_path}",
]
p = subprocess.run(start_cmd)
assert p.returncode == 0

time.sleep(2)
# cmd = ["curl", "localhost:9090/internal/force_flush"]
# p = subprocess.run(cmd)
# cmd = ["curl", "localhost:9090/internal/force_merge"]
# p = subprocess.run(cmd)
cmd = ["curl", "localhost:9090/internal/resetRollupResultCache"]
p = subprocess.run(cmd)
# cmd = ["curl", 'localhost:9090/api/v1/query?query=rmsjob_info']
# p = subprocess.run(cmd)

# time.sleep(5)

def stop_victoria_proxy(self, ignore_errors=False):
stop_cmd = ["docker", "exec", "slurm-controller", "pkill", "-f", "-SIGTERM", "victoria-metrics"]
p = subprocess.run(stop_cmd)
Expand Down Expand Up @@ -184,5 +210,5 @@ def run_job(self, nodes, seconds):
for pattern in patterns:
assert re.search(pattern, p.stdout) != None, f"Missing expected pattern\n{p.stdout}"

self.remove_job_file()
# self.remove_job_file()
return jobid

0 comments on commit 0c1aea0

Please sign in to comment.