Skip to content

Commit

Permalink
Merge pull request #839 from linsword13/slurm-status
Browse files: browse the repository at this point in the history
Fail more gracefully upon status check error
  • Loading branch information
douglasjacobsen authored Jan 24, 2025
2 parents d559b7e + 7ebbf4a commit 8a4be53
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 3 deletions.
13 changes: 12 additions & 1 deletion lib/ramble/ramble/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,18 @@

experiment_status = Enum(
"experiment_status",
["UNKNOWN", "SETUP", "SUBMITTED", "RUNNING", "COMPLETE", "SUCCESS", "FAILED", "CANCELLED"],
[
"UNKNOWN",
# UNRESOLVED means the status could not be fetched successfully
"UNRESOLVED",
"SETUP",
"SUBMITTED",
"RUNNING",
"COMPLETE",
"SUCCESS",
"FAILED",
"CANCELLED",
],
)

_NULL_CONTEXT = "null"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def get_status(self, workspace):
expander = self.app_inst.expander
run_dir = expander.expand_var_name("experiment_run_dir")
job_id_file = os.path.join(run_dir, ".slurm_job")
status = experiment_status.UNKNOWN
status = experiment_status.UNRESOLVED
if not os.path.isfile(job_id_file):
logger.warn("job_id file is missing")
return status
Expand Down Expand Up @@ -235,7 +235,13 @@ def get_status(self, job_id):
if not status_out:
self._ensure_runner("sacct")
sacct_args = ["-o", "state", "-X", "-n", "-j", job_id]
status_out = self.sacct_runner.command(*sacct_args, output=str)
try:
status_out = self.sacct_runner.command(*sacct_args, output=str)
except ProcessError as e:
status_out = ""
logger.debug(
f"sacct returns error {e}. The status is not resolved correctly."
)
return status_out.strip()

def get_partitions(self):
Expand Down

0 comments on commit 8a4be53

Please sign in to comment.