Skip to content

Commit

Permalink
Add warning when squeue is timing out
Browse files (browse the repository at this point in the history)
Signed-off-by: Jordà Polo <[email protected]>
  • Loading branch information
jordap authored and koomie committed Jan 13, 2025
1 parent b30b431 commit 02ee8a9
Showing 1 changed file with 13 additions and 3 deletions.
16 changes: 13 additions & 3 deletions omnistat/collector_rms.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,13 @@ def querySlurmJob(self, timeout=1, exit_on_error=False, mode="squeue"):

if mode == "squeue":
data = utils.runShellCommand(self.__squeue_query, timeout=timeout, exit_on_error=exit_on_error)
if data == None:
logging.warning(
"Failed to capture job information: squeue timed out. "
"Please increase sampling interval or switch to file-based mode."
)
# squeue query output format: JOBID:USER:PARTITION:NUM_NODES:BATCHFLAG
if data and data.stdout.strip():
elif data.stdout.strip():
data = data.stdout.strip().split(":")
keys = [
"RMS_JOB_ID",
Expand All @@ -111,9 +116,14 @@ def querySlurmJob(self, timeout=1, exit_on_error=False, mode="squeue"):
results["RMS_TYPE"] = "slurm"

# A second query is required to ascertain job steps (otherwise we miss out on the batchflag)
data = utils.runShellCommand(self.__squeue_steps, timeout=timeout, exit_on_error=exit_on_error)
results["RMS_STEP_ID"] = -1
if data and data.stdout.strip():
data = utils.runShellCommand(self.__squeue_steps, timeout=timeout, exit_on_error=exit_on_error)
if data == None:
logging.warning(
"Failed to capture job step information: squeue timed out. "
"Please increase sampling interval or switch to file-based mode."
)
elif data.stdout.strip():
# If we are in an active job step, the STEPID will have an integer index appended, e.g.
# 57735.10
# 57735.interactive
Expand Down

0 comments on commit 02ee8a9

Please sign in to comment.