Skip to content

Commit

Permalink
update log format
Browse files Browse the repository at this point in the history
  • Loading branch information
BO SANG committed Jan 16, 2025
1 parent f350f4b commit b648ea4
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 4 deletions.
4 changes: 2 additions & 2 deletions dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def diagnose_training_failure(self) -> NodeAction:
def _report_failure_to_master(self, failures, restart_count):
errors = {}
if len(failures) == 0:
logger.info("there is no failures. skip failures report")
logger.info("Skip failure report due to empty failures")
return
for rank, failure in failures.items():
dt = str(datetime.fromtimestamp(int(failure.timestamp)))
Expand All @@ -288,7 +288,7 @@ def send_heartbeat(self):
action = self._client.report_heart_beat(ts)
self._agent_context.enqueue_diagnosis_action(action)
except Exception as e:
logger.warning(f"fail to report a heartbeat: {e}")
logger.warning(f"Fail to report a heartbeat: {e}")

Check warning on line 291 in dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py

View check run for this annotation

Codecov / codecov/patch

dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py#L291

Added line #L291 was not covered by tests

def _periodically_report(self):
logger.info("Start diagnosis agent periodically reporter.")
Expand Down
3 changes: 1 addition & 2 deletions dlrover/python/master/node/dist_job_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1198,9 +1198,8 @@ def handle_training_failure(
):
"""Process the training failure reported by the node."""
node = self._job_context.job_node(node_type, node_id)
logger.info(f"handle failed node: {node}")
logger.info(f"Handle failed node: {node}")
if node.is_released:
logger.info(f"The node {node.name} has been released.")
return
relaunch_node = self._process_error(
node, restart_count, error_data, level
Expand Down

0 comments on commit b648ea4

Please sign in to comment.