diff --git a/dlrover/python/diagnosis/common/constants.py b/dlrover/python/diagnosis/common/constants.py index a0309562e..a55a2a864 100644 --- a/dlrover/python/diagnosis/common/constants.py +++ b/dlrover/python/diagnosis/common/constants.py @@ -33,6 +33,7 @@ class InferenceConfigKey(object): class DiagnosisConstant(object): MASTER_DIAGNOSIS_OBSERVING_INTERVAL_SECS = 180 + # the minimum diagnosis interval is 5 seconds AGENT_PERIODICALLY_DIAGNOSIS_INTERVAL_SECS = 5 AGENT_PERIODICALLY_REPORT_INTERVAL_SECS = 15 MASTER_INSTANCE = -1 diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/resolver/resolve_gpu_errors_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/resolver/resolve_gpu_errors_operator.py index a604e74d1..3bf2f3c56 100644 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/resolver/resolve_gpu_errors_operator.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/resolver/resolve_gpu_errors_operator.py @@ -32,7 +32,9 @@ class ResolveGPUErrorsOperator(InferenceOperator): """ - ResolveGPUErrorsOperator is to diagnose GPU errors + ResolveGPUErrorsOperator is to diagnose GPU errors like: + 1. GPU lost. This error will be reported to the master and + the master will expose this error at this moment. """ def __init__(self): diff --git a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py index 35b1a107a..f0808b18b 100644 --- a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py +++ b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py @@ -57,10 +57,10 @@ class DiagnosisAgent(Singleton): def __init__( - self, - training_log_file="", - errors="", - rank=-1, + self, + training_log_file="", + errors="", + rank=-1, ): self._client = MasterClient.singleton_instance() self._training_log_file = training_log_file @@ -168,7 +168,7 @@ def _observe(self, observe_problems: List[Inference]) -> List[Inference]: logger.error(f"fail to observe problem {problem}: {e}") new_obs: List[Inference] = [] for ob in observations: - if not is_inference_included(self._observe_problems, ob): + if not is_inference_included(observe_problems, ob): new_obs.append(ob) return new_obs