From 01b98626e653b3bb9adf887a741dc7a9f530d2f2 Mon Sep 17 00:00:00 2001 From: zhangjiajin Date: Tue, 12 Apr 2022 01:14:04 +0000 Subject: [PATCH] bug fix: report, horovod, docs --- docs/cn/user/config_reference.md | 3 ++- docs/cn/user/faq.md | 24 ++++++++--------- docs/en/user/config_reference.md | 3 ++- docs/en/user/faq.md | 26 +++++++++---------- .../pipeline/horovod/run_horovod_train.sh | 2 +- vega/core/pipeline/horovod_train_step.py | 1 + vega/report/report_server.py | 12 ++++----- 7 files changed, 37 insertions(+), 34 deletions(-) diff --git a/docs/cn/user/config_reference.md b/docs/cn/user/config_reference.md index df3429f..1b11503 100644 --- a/docs/cn/user/config_reference.md +++ b/docs/cn/user/config_reference.md @@ -158,7 +158,8 @@ fully_train: common: data_path: /cache/datasets/cifar10/ ``` -**注**: HCCL支持多机多卡,Horovod目前只支持单机多卡。 + +**注**: 使用Horovod时,集群里的各节点间都需要配置SSH互信,且确保各主机的python版本和路径一致。 ## 3. NAS和HPO配置项 diff --git a/docs/cn/user/faq.md b/docs/cn/user/faq.md index 28aee9c..b390e4e 100644 --- a/docs/cn/user/faq.md +++ b/docs/cn/user/faq.md @@ -108,22 +108,22 @@ Vega在多个GPU/NPU场景中,会启动dask scheduler、dask worker及训练 ```bash # 查询运行中的Vega主程序的进程ID -vega-kill -l +vega-process -s # 终止一个Vega主程序及相关进程 -vega-kill -p +vega-kill -s -p # 或者一次性的终止所有Vega相关进程 -vega-kill -a +vega-kill -s -a # 若主程序被非常正常关闭,还存在遗留的相关进程,可使用强制清理 -vega-kill -f +vega-kill -s -f ``` 在普通模式下,使用如下命令: ```bash -vega-kill -s -l -vega-kill -s -p -vega-kill -s -a -vega-kill -s -f +vega-process +vega-kill -p +vega-kill -a +vega-kill -f ``` ### 2.6 如何查询正在运行的vega程序 @@ -131,13 +131,13 @@ vega-kill -s -f 在安全模式下,可通过如下命令查询正在运行的Vega应用程序: ```bash -vega-process +vega-process -s ``` 在普通模式下,可通过如下命令查询: ```bash -vega-process -s +vega-process ``` ### 2.7 如何查询vega程序运行进度 @@ -145,13 +145,13 @@ vega-process -s 在安全模式下,可通过如下命令查询正在运行的Vega程序运行进度: ```bash -vega-progress -t -r +vega-progress -s -t -r ``` 在普通模式下,可通过如下命令查询: ```bash -vega-progress -s -t -r +vega-progress -t -r ``` ### 2.8 如何使用vega程序执行模型推理 diff --git 
a/docs/en/user/config_reference.md b/docs/en/user/config_reference.md index 7499143..912965a 100644 --- a/docs/en/user/config_reference.md +++ b/docs/en/user/config_reference.md @@ -156,7 +156,8 @@ fully_train: common: data_path: /cache/datasets/cifar10/ ``` -**Note**: HCCL supports multi-machine multi-card, Horovod currently only supports single machine multi-card. + +**Note**: When Horovod is used, passwordless SSH login must be configured between all nodes in the cluster, and every host must use the same Python version and installation path. ## 3. NAS and HPO configuration items diff --git a/docs/en/user/faq.md b/docs/en/user/faq.md index 2863227..eb9c3d9 100644 --- a/docs/en/user/faq.md +++ b/docs/en/user/faq.md @@ -108,24 +108,24 @@ In safe mode, the Vega application can be terminated using the following command ```bash # Query the process ID of the running Vega main program. -vega-process +vega-process -s # Terminate a Vega main program and related processes. -vega-kill -p +vega-kill -s -p # Terminate a Vega main program and related processes. -vega-kill -t +vega-kill -s -t # Or stop all Vega-related processes at a time. -vega-kill -a +vega-kill -s -a # If the main program is shut down normally and there are remaining related processes, you can forcibly clear the process. 
-vega-kill -f +vega-kill -s -f ``` In common mode, run the following command:: ```bash -vega-kill -s -l -vega-kill -s -p -vega-kill -s -a -vega-kill -s -f +vega-process +vega-kill -p +vega-kill -a +vega-kill -f ``` ### 2.6 How Do I Query the Running Vega Program @@ -133,13 +133,13 @@ vega-kill -s -f In safe mode, run the following command to query the running Vega applications: ```bash -vega-process +vega-process -s ``` In common mode, you can run the following command to query: ```bash -vega-process -s +vega-process ``` ### 2.7 How Do I Query the Vega Program Running Progress @@ -147,13 +147,13 @@ vega-process -s In safe mode, you can run the following command to query the running progress of the Vega program: ```bash -vega-progress -t -r +vega-progress -s -t -r ``` In common mode, you can run the following command to query: ```bash -vega-progress -s -t -r +vega-progress -t -r ``` ### 2.8 How to Perform Model Inference Using the Vega Program diff --git a/vega/core/pipeline/horovod/run_horovod_train.sh b/vega/core/pipeline/horovod/run_horovod_train.sh index be1f3e9..f8f50d5 100644 --- a/vega/core/pipeline/horovod/run_horovod_train.sh +++ b/vega/core/pipeline/horovod/run_horovod_train.sh @@ -17,7 +17,7 @@ IFS="," eval 'server_list="${IP_ARRAY[*]}"' run_experiment() { local np=$1 shift - horovodrun -np $np -H $server_list $@ + horovodrun --start-timeout 300 -np $np -H $server_list $@ } run_experiment $nps $PYTHON_COMMAND $SCRIPT_PATH --cf_file $2 diff --git a/vega/core/pipeline/horovod_train_step.py b/vega/core/pipeline/horovod_train_step.py index a7aa6e3..5fda94b 100644 --- a/vega/core/pipeline/horovod_train_step.py +++ b/vega/core/pipeline/horovod_train_step.py @@ -56,6 +56,7 @@ def do(self): def _set_cluster_info(self): General.cluster.num_workers_per_node = len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) + General.cluster.num_nodes = max(General.cluster.num_nodes, 1 + len(General.cluster.slaves)) General.cluster.num_workers = 
General.cluster.num_workers_per_node * General.cluster.num_nodes def train_model(self, trainer): diff --git a/vega/report/report_server.py b/vega/report/report_server.py index 375c9bb..10e31a9 100644 --- a/vega/report/report_server.py +++ b/vega/report/report_server.py @@ -129,18 +129,18 @@ def get_pareto_front_records(self, step_name=None, nums=None, selected_key=None, self.old_not_finished_workers = not_finished logging.info(f"waiting for the workers {str(not_finished)} to finish") if not records: - self.update_step_info({"step_name": step_name, "best_models": []}) + self.update_step_info(step_name=step_name, best_models=[]) return [] pareto = self.pareto_front(step_name, nums, records=records) if not pareto: - self.update_step_info(**{"step_name": step_name, "best_models": []}) + self.update_step_info(step_name=step_name, best_models=[]) return [] if choice is not None: - records = random.choice(pareto) - self.update_step_info(**{"step_name": step_name, "best_models": [record.worker_id for record in records]}) - return [records] + record = random.choice(pareto) + self.update_step_info(step_name=step_name, best_models=[record.worker_id]) + return [record] else: - self.update_step_info(**{"step_name": step_name, "best_models": [record.worker_id for record in pareto]}) + self.update_step_info(step_name=step_name, best_models=[record.worker_id for record in pareto]) return pareto @classmethod