diff --git a/sisyphus/global_settings.py b/sisyphus/global_settings.py index 1bf2e25..8d8173e 100644 --- a/sisyphus/global_settings.py +++ b/sisyphus/global_settings.py @@ -60,6 +60,31 @@ def worker_wrapper(job, task_name, call): return call +def on_job_failure(job): + """ + Job failure hook. + + Can be used for generic job-independent error monitoring, handling or retry + logic. + + Sisyphus will call this function with the job instance for any failed job. + + The callback itself is then responsible for any retry logic, realized by e.g. + analyzing the job log file and removing error files in the job directory as + needed. + + The callback needs to be stateless and idempotent, as it can be called multiple + times on the same job, especially if the job remains in the error state after the + callback has finished. + + Do: + - use with caution + - ensure you don't build infinite retry loops + - limit to specific use cases (e.g. local disk full, GPU broken, etc.) + """ + pass + + def update_engine_rqmt(last_rqmt: Dict, last_usage: Dict): """Update requirements after a job got interrupted, double limits if needed diff --git a/sisyphus/manager.py b/sisyphus/manager.py index 749daf3..d3539bc 100644 --- a/sisyphus/manager.py +++ b/sisyphus/manager.py @@ -620,6 +620,9 @@ def run(self): self.resume_jobs() self.run_jobs() + for job in self.jobs.get(gs.STATE_ERROR, []): + gs.on_job_failure(job) + # Stop config reader config_manager.cancel_all_reader()