Skip to content

Commit

Permalink
ut fix
Browse files — browse the repository at this point in the history
  • Loading branch information
BalaBalaYi committed Jan 22, 2025
1 parent 560ea1d commit 0bdb747
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions dlrover/python/tests/test_job_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,9 @@ def test_relaunch_node(self):
manager = create_job_manager(params, SpeedMonitor())
self.assertEqual(manager._ps_relaunch_max_num, 1)
manager.start()

# reset failed nodes for testing
self.job_context._failed_nodes = {}
self.assertEqual(manager._job_args.job_uuid, _MOCK_JOB_UUID)

job_nodes = self.job_context.job_nodes()
Expand Down Expand Up @@ -296,18 +299,18 @@ def test_relaunch_node(self):
should_relaunch = manager._should_relaunch(node, NODE_STATE_FLOWS[6])
self.assertFalse(should_relaunch)

self.assertEqual(self.job_context.get_failed_node_cnt(), 2)
self.assertEqual(self.job_context.get_failed_node_cnt(), 0)
manager.handle_training_failure(
NodeType.WORKER, 0, level=TrainingExceptionLevel.NODE_ERROR
)
manager.handle_training_failure(
NodeType.WORKER, 0, level=TrainingExceptionLevel.NODE_ERROR
)
self.assertEqual(self.job_context.get_failed_node_cnt(), 3)
self.assertEqual(self.job_context.get_failed_node_cnt(), 1)
manager.handle_training_failure(
NodeType.WORKER, 1, level=TrainingExceptionLevel.NODE_ERROR
)
self.assertEqual(self.job_context.get_failed_node_cnt(), 3)
self.assertEqual(self.job_context.get_failed_node_cnt(), 2)

def test_relaunch_under_deleted_event(self):
params = MockK8sPSJobArgs()
Expand Down

0 comments on commit 0bdb747

Please sign in to comment.