Skip to content

Commit

Permalink
fix comments
Browse files Browse the repository at this point in the history
  • Loading branch information
BO SANG committed Jan 2, 2025
1 parent 7173753 commit 111fe18
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 6 deletions.
1 change: 1 addition & 0 deletions dlrover/python/diagnosis/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class InferenceConfigKey(object):

class DiagnosisConstant(object):
MASTER_DIAGNOSIS_OBSERVING_INTERVAL_SECS = 180
# the minimum diagnosis interval is 5 seconds
AGENT_PERIODICALLY_DIAGNOSIS_INTERVAL_SECS = 5
AGENT_PERIODICALLY_REPORT_INTERVAL_SECS = 15
MASTER_INSTANCE = -1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@

class ResolveGPUErrorsOperator(InferenceOperator):
"""
ResolveGPUErrorsOperator is to diagnose GPU errors
ResolveGPUErrorsOperator diagnoses GPU errors such as:
1. GPU lost. This error is reported to the master, which
currently exposes the error.
"""

def __init__(self):
Expand Down
10 changes: 5 additions & 5 deletions dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@

class DiagnosisAgent(Singleton):
def __init__(
self,
training_log_file="",
errors="",
rank=-1,
self,
training_log_file="",
errors="",
rank=-1,
):
self._client = MasterClient.singleton_instance()
self._training_log_file = training_log_file
Expand Down Expand Up @@ -168,7 +168,7 @@ def _observe(self, observe_problems: List[Inference]) -> List[Inference]:
logger.error(f"fail to observe problem {problem}: {e}")
new_obs: List[Inference] = []
for ob in observations:
if not is_inference_included(self._observe_problems, ob):
if not is_inference_included(observe_problems, ob):
new_obs.append(ob)
return new_obs

Expand Down

0 comments on commit 111fe18

Please sign in to comment.