Skip to content

Commit

Permalink
fix network check status report
Browse files Browse the repository at this point in the history
  • Loading branch information
BalaBalaYi committed Jan 21, 2025
1 parent 2c5f8e4 commit d101246
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions dlrover/python/elastic_agent/torch/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -1424,17 +1424,19 @@ def run(self, role: str = DEFAULT_ROLE) -> bool:
f"Network check time of round {i} is {elapsed_time}"
f" and succeed is {result}."
)
+
+ success = success or result
status = (
NodeEventType.NODE_CHECK_SUCCEEDED
- if result
+ if success
else NodeEventType.NODE_CHECK_FAILED
)
self._client.report_network_check_status(
self._node_rank,
status,
elapsed_time,
)
- success = success or result
+
fault_nodes, fault_reason = self._client.check_fault_node(
timeout=self._get_check_node_timeout()
)
Expand Down Expand Up @@ -1474,6 +1476,9 @@ def run(self, role: str = DEFAULT_ROLE) -> bool:
logger.warning("This node is a straggler!")
if self._config.exclude_straggler:
raise RuntimeError("The node is a straggler and exits.")
+
+
+
return success

def _run_node_check(self, monitor_interval=3, timeout=300):
Expand Down

0 comments on commit d101246

Please sign in to comment.