Skip to content

Commit

Permalink
export start event
Browse files Browse the repository at this point in the history
  • Loading branch information
liyanzhe.lyz committed Jan 22, 2025
1 parent 1a15252 commit 947544f
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 2 deletions.
6 changes: 6 additions & 0 deletions dlrover/python/master/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from dlrover.python.master.args import parse_master_args
from dlrover.python.scheduler.factory import new_job_args
from dlrover.python.scheduler.job import JobArgs
from dlrover.python.training_event import DLRoverMaster

_dlrover_context = Context.singleton_instance()

Expand Down Expand Up @@ -62,6 +63,11 @@ def run(args):

def main():
args = parse_master_args()

# export event for dlrover master
master = DLRoverMaster()
master.start(pid=vars(args))

Check warning on line 69 in dlrover/python/master/main.py

View check run for this annotation

Codecov / codecov/patch

dlrover/python/master/main.py#L68-L69

Added lines #L68 - L69 were not covered by tests

exit_code = run(args)
return exit_code

Expand Down
5 changes: 3 additions & 2 deletions dlrover/python/training_event/predefined/dlrover.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Optional, List

from .common import CommonPredefined
from dlrover.python.common.singleton import Singleton


class DLRoverCommonEventName(Enum):
Expand Down Expand Up @@ -133,7 +134,7 @@ def exit(self, reason: str, **kwargs):
)


-class DLRoverMaster(DLRoverCommon):
+class DLRoverMaster(DLRoverCommon, Singleton):
"""DLRover Master events"""

def __init__(self) -> None:
Expand Down Expand Up @@ -193,7 +194,7 @@ def fault_detect(self, reason: str, **kwargs):
)


-class DLRoverAgent(DLRoverCommon):
+class DLRoverAgent(DLRoverCommon, Singleton):
"""DLRover Agent events"""

def __init__(self) -> None:
Expand Down
5 changes: 5 additions & 0 deletions dlrover/trainer/torch/elastic_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@
launch_agent,
)
from dlrover.trainer.torch.utils import version_less_than_230
from dlrover.python.training_event import DLRoverAgent


def parse_args(args):
Expand Down Expand Up @@ -446,6 +447,10 @@ def _check_to_use_dlrover_run(master_addr, max_nodes, timeout=120):


def run(args):
# export event for dlrover agent
agent = DLRoverAgent()
agent.start(pid=vars(args))

Check warning on line 452 in dlrover/trainer/torch/elastic_run.py

View check run for this annotation

Codecov / codecov/patch

dlrover/trainer/torch/elastic_run.py#L451-L452

Added lines #L451 - L452 were not covered by tests

logger.info(f"DLRover agent started with: {cu.get_dlrover_version()}.")
master_handler = None
master_addr = os.getenv(NodeEnv.DLROVER_MASTER_ADDR, "")
Expand Down

0 comments on commit 947544f

Please sign in to comment.