Skip to content

Commit

Permalink
add ut
Browse files (browse the repository at this point in the history)
  • Loading branch information
BalaBalaYi committed Jan 21, 2025
1 parent a6e9b12 commit a249fa4
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 3 deletions.
11 changes: 8 additions & 3 deletions dlrover/python/elastic_agent/torch/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,13 @@ def _set_paral_config():
def _get_local_ip():
local_ip = os.getenv("POD_IP", "")
if not local_ip:
- local_ip = socket.gethostbyname(_get_fq_hostname())
+ try:
+ local_ip = socket.gethostbyname(_get_fq_hostname())
+ except socket.gaierror:
+ logger.warning(

Check warning on line 141 in dlrover/python/elastic_agent/torch/training.py

View check run for this annotation

Codecov / codecov/patch

dlrover/python/elastic_agent/torch/training.py#L140-L141

Added lines #L140 - L141 were not covered by tests
+ "Can not resolve host IP. " "Use default '127.0.0.1' instead."
+ )
+ local_ip = "127.0.0.1"

Check warning on line 144 in dlrover/python/elastic_agent/torch/training.py

View check run for this annotation

Codecov / codecov/patch

dlrover/python/elastic_agent/torch/training.py#L144

Added line #L144 was not covered by tests
return local_ip


Expand Down Expand Up @@ -1315,8 +1321,7 @@ def launch_agent(
if (
(exc_type is not None)
or (result is not None and result.is_failed())
- and not is_node_check_failed
- ):
+ ) and not is_node_check_failed:
client.report_failed_exited()
logger.info("Failed and exit.")
elif is_node_check_failed:
Expand Down
33 changes: 33 additions & 0 deletions dlrover/python/tests/test_elastic_training_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,14 @@
ElasticTrainingAgent,
MasterRendezvousHandler,
NodeCheckElasticAgent,
NodeCheckFailedError,
RendezvousOutSyncError,
_create_check_agent,
_create_worker_spec,
_get_local_ip,
_set_paral_config,
comm_perf_check,
launch_agent,
node_health_check,
)
from dlrover.python.tests.test_utils import start_local_master
Expand Down Expand Up @@ -678,6 +680,37 @@ def test_diagnosis(self):
1,
)

@patch(
"dlrover.python.elastic_agent.master_client"
".MasterClient.report_failed_exited"
)
@patch(
"dlrover.python.elastic_agent.torch.training"
".ElasticTrainingAgent.run"
)
def test_node_status_report(self, mock_run, mock_report_failed_exited):
config = ElasticLaunchConfig(1, 1, 1)
entrypoint = "python"

mock_run.side_effect = RuntimeError("test")
mock_report_failed_exited.return_value = True
try:
launch_agent(config, entrypoint, [])
self.fail()

Check warning on line 699 in dlrover/python/tests/test_elastic_training_agent.py

View check run for this annotation

Codecov / codecov/patch

dlrover/python/tests/test_elastic_training_agent.py#L699

Added line #L699 was not covered by tests
except RuntimeError:
self.assertTrue(True)
mock_run.assert_called_once()
mock_report_failed_exited.assert_called_once()

mock_run.side_effect = NodeCheckFailedError("test")
try:
launch_agent(config, entrypoint, [])
self.fail()

Check warning on line 708 in dlrover/python/tests/test_elastic_training_agent.py

View check run for this annotation

Codecov / codecov/patch

dlrover/python/tests/test_elastic_training_agent.py#L708

Added line #L708 was not covered by tests
except NodeCheckFailedError:
self.assertTrue(True)
self.assertEqual(mock_run.call_count, 2)
mock_report_failed_exited.assert_called_once()


class NodeCheckElasticAgentTest(unittest.TestCase):
def setUp(self) -> None:
Expand Down

0 comments on commit a249fa4

Please sign in to comment.