Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alexleung/dev branch #1715

Merged
merged 2 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions devops/scripts/build-fedml-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,7 @@ if [ "$build_arm_arch_images" != "" ]; then
cd $pwd
fi

cd ./installation/build_fedml_docker
docker build -f light/Dockerfile \
docker build -f ./installation/build_fedml_docker/light/Dockerfile \
--network=host \
-t ${DOCKER_REGISTRY}/fedml/fedml:light .
cd $pwd
8 changes: 7 additions & 1 deletion installation/build_fedml_docker/light/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,14 @@ ENV HOME /home/$USER
##############################################################################
# Add docker location file
##############################################################################
ADD ./light/docker-location.yml /home/fedml/fedml-client/fedml/data/docker-location.yml
ADD ./installation/build_fedml_docker/light/docker-location.yml /home/fedml/fedml-client/fedml/data/docker-location.yml

RUN pip3 install fedml

COPY ./python ./fedml/fedml-pip
WORKDIR ./fedml/fedml-pip
RUN pip3 install -e ./




2 changes: 1 addition & 1 deletion python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.8.12a58"
__version__ = "0.8.12a60"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
Expand Down
43 changes: 29 additions & 14 deletions python/fedml/computing/scheduler/comm_utils/job_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,20 +466,34 @@ def monitor_slave_run_process_status(self):
count += 1
if count >= 1000:
break
all_run_processes_exited = True

# Calc the timeout
started_time = int(float(job.started_time))
timeout = time.time() - started_time

# Check if all processes of the specific run are exited
run_process_list = client_constants.ClientConstants.get_learning_process_list(job.job_id)
for run_process_id in run_process_list:
try:
process = psutil.Process(int(run_process_id))
except Exception as e:
process = None
pass
if process is not None:
all_run_processes_exited = False

# If the run processes have exited but run status is not completed,
# then release gpu ids and report failed status to the master agent.
if all_run_processes_exited and not SchedulerConstants.is_run_completed(job.status):
all_run_processes_exited = True if len(run_process_list) <= 0 else False

# Get the timeout threshold
timeout_threshold = None
if job.status == client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_PROVISIONING or \
job.status == client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_QUEUED:
timeout_threshold = SchedulerConstants.TRAIN_PROVISIONING_TIMEOUT
elif job.status == client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_INITIALIZING or \
job.status == client_constants.ClientConstants.MSG_MLOPS_RUN_STATUS_STARTING or \
job.status == client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING:
timeout_threshold = SchedulerConstants.TRAIN_STARTING_TIMEOUT
elif job.status == client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_TRAINING or \
job.status == client_constants.ClientConstants.MSG_MLOPS_RUN_STATUS_RUNNING:
timeout_threshold = SchedulerConstants.TRAIN_RUNNING_TIMEOUT
elif job.status == client_constants.ClientConstants.MSG_MLOPS_RUN_STATUS_STOPPING:
timeout_threshold = SchedulerConstants.TRAIN_STOPPING_TIMEOUT

# If the run processes have exited but run status is not completed and
# timeout is out of the range, then release gpu ids and report failed status to the master agent.
if all_run_processes_exited and not SchedulerConstants.is_run_completed(job.status) and \
timeout_threshold is not None and timeout > timeout_threshold:
# Release the gpu ids
self.release_gpu_ids(job.job_id, job.edge_id)

Expand Down Expand Up @@ -509,7 +523,8 @@ def monitor_master_run_process_status(self, server_id, device_info_reporter=None
timeout_threshold = None
if job.status == server_constants.ServerConstants.MSG_MLOPS_SERVER_STATUS_PROVISIONING:
timeout_threshold = SchedulerConstants.TRAIN_PROVISIONING_TIMEOUT
elif job.status == server_constants.ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING:
elif job.status == server_constants.ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING or \
job.status == server_constants.ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING:
timeout_threshold = SchedulerConstants.TRAIN_STARTING_TIMEOUT
elif job.status == server_constants.ServerConstants.MSG_MLOPS_SERVER_STATUS_RUNNING:
timeout_threshold = SchedulerConstants.TRAIN_RUNNING_TIMEOUT
Expand Down
4 changes: 1 addition & 3 deletions python/fedml/computing/scheduler/master/server_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2500,9 +2500,7 @@ def setup_agent_mqtt_connection(self, service_config):
if not self.run_as_cloud_server:
self.recover_start_train_msg_after_upgrading()

JobRunnerUtils.get_instance().sync_run_process_gpu()
JobRunnerUtils.get_instance().sync_endpoint_process_gpu()
JobRunnerUtils.get_instance().reset_available_gpu_id_list(self.edge_id)
JobRunnerUtils.get_instance().sync_data_on_startup(self.edge_id)

self.master_api_daemon = MasterApiDaemon()
self.master_api_process = Process(target=self.master_api_daemon.run)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def setup_redis_connection(self, redis_addr, redis_port, redis_password="fedml_d
_, env_redis_addr, env_redis_port, env_redis_pwd = \
SchedulerConstants.get_redis_and_infer_host_env_addr()
redis_addr = env_redis_addr if env_redis_addr is not None else redis_addr
redis_addr = "localhost" if redis_addr is not None and redis_addr == "local" else redis_addr
redis_port = env_redis_port if env_redis_port is not None else redis_port
redis_password = env_redis_pwd if env_redis_pwd is not None else redis_password

Expand All @@ -50,7 +51,7 @@ def setup_redis_connection(self, redis_addr, redis_port, redis_password="fedml_d
self.redis_pool = redis.ConnectionPool(host=redis_addr, port=int(redis_port),
password=redis_password, decode_responses=True)
self.redis_connection = redis.Redis(connection_pool=self.redis_pool)
self.redis_connection.exists("FEDML_KEYS")
self.redis_connection.set("FEDML_TEST_KEYS", "TEST")
is_connected = True
except Exception as e:
is_connected = False
Expand All @@ -65,7 +66,7 @@ def setup_public_redis_connection(self):
host=SchedulerConstants.PUBLIC_REDIS_ADDR, port=SchedulerConstants.PUBLIC_REDIS_PORT,
password=SchedulerConstants.PUBLIC_REDIS_PASSWORD, decode_responses=True)
self.redis_connection = redis.Redis(connection_pool=self.redis_pool)
self.redis_connection.exists("FEDML_KEYS")
self.redis_connection.set("FEDML_TEST_KEYS", "TEST")
is_connected = True
except Exception as e:
pass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def setup_redis_connection(self, redis_addr, redis_port, redis_password="fedml_d
_, env_redis_addr, env_redis_port, env_redis_pwd = \
SchedulerConstants.get_redis_and_infer_host_env_addr()
redis_addr = env_redis_addr if env_redis_addr is not None else redis_addr
redis_addr = "localhost" if redis_addr is not None and redis_addr == "local" else redis_addr
redis_port = env_redis_port if env_redis_port is not None else redis_port
redis_password = env_redis_pwd if env_redis_pwd is not None else redis_password

Expand All @@ -37,7 +38,7 @@ def setup_redis_connection(self, redis_addr, redis_port, redis_password="fedml_d
self.redis_pool = redis.ConnectionPool(host=redis_addr, port=int(redis_port),
password=redis_password, decode_responses=True)
self.redis_connection = redis.Redis(connection_pool=self.redis_pool)
self.redis_connection.exists("FEDML_KEYS")
self.redis_connection.set("FEDML_TEST_KEYS", "TEST")
self.gpu_cache.redis_connection = self.redis_connection
self.logs_cache.redis_connection = self.redis_connection
is_connected = True
Expand All @@ -54,7 +55,7 @@ def setup_public_redis_connection(self):
host=SchedulerConstants.PUBLIC_REDIS_ADDR, port=SchedulerConstants.PUBLIC_REDIS_PORT,
password=SchedulerConstants.PUBLIC_REDIS_PASSWORD, decode_responses=True)
self.redis_connection = redis.Redis(connection_pool=self.redis_pool)
self.redis_connection.exists("FEDML_KEYS")
self.redis_connection.set("FEDML_TEST_KEYS", "TEST")
self.gpu_cache.redis_connection = self.redis_connection
self.logs_cache.redis_connection = self.redis_connection
is_connected = True
Expand Down
1 change: 1 addition & 0 deletions python/fedml/computing/scheduler/slave/client_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

class ClientConstants(object):
MSG_MLOPS_CLIENT_STATUS_OFFLINE = "OFFLINE"
MSG_MLOPS_CLIENT_STATUS_PROVISIONING = "PROVISIONING"
MSG_MLOPS_CLIENT_STATUS_IDLE = "IDLE"
MSG_MLOPS_CLIENT_STATUS_UPGRADING = "UPGRADING"
MSG_MLOPS_CLIENT_STATUS_QUEUED = "QUEUED"
Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def finalize_options(self):

setup(
name="fedml",
version="0.8.12a58",
version="0.8.12a60",
author="FedML Team",
author_email="[email protected]",
description="A research and production integrated edge-cloud library for "
Expand Down
Loading