Merge pull request #233 from huawei-noah/zjj_bug_fix_report_and_horovod

bug fix: report, horovod, docs
huawei-noah · Apr 12, 2022 · 3fa253a · 3fa253a
2 parents 78ec4af + 01b9862
commit 3fa253a
Show file tree

Hide file tree

Showing 7 changed files with 37 additions and 34 deletions.
diff --git a/docs/cn/user/config_reference.md b/docs/cn/user/config_reference.md
@@ -158,7 +158,8 @@ fully_train:
         common:
             data_path: /cache/datasets/cifar10/
 ```
-**注**: HCCL支持多机多卡，Horovod目前只支持单机多卡。
+
+**注**: 使用Horovod时，集群里的各节点间都需要配置SSH互信，且确保各主机的python版本和路径一致。
 
 ## 3. NAS和HPO配置项
 

diff --git a/docs/cn/user/faq.md b/docs/cn/user/faq.md
@@ -108,50 +108,50 @@ Vega在多个GPU/NPU场景中，会启动dask scheduler、dask worker及训练
 
 ```bash
 # 查询运行中的Vega主程序的进程ID
-vega-kill -l
+vega-process -s
 # 终止一个Vega主程序及相关进程
-vega-kill -p <pid>
+vega-kill -s -p <pid>
 # 或者一次性的终止所有Vega相关进程
-vega-kill -a
+vega-kill -s -a
 # 若主程序被非正常关闭，还存在遗留的相关进程，可使用强制清理
-vega-kill -f
+vega-kill -s -f
 ```
 
 在普通模式下，使用如下命令：
 
 ```bash
-vega-kill -s -l
-vega-kill -s -p <pid>
-vega-kill -s -a
-vega-kill -s -f
+vega-process
+vega-kill -p <pid>
+vega-kill -a
+vega-kill -f
 ```
 
 ### 2.6 如何查询正在运行的vega程序
 
 在安全模式下，可通过如下命令查询正在运行的Vega应用程序：
 
 ```bash
-vega-process
+vega-process -s
 ```
 
 在普通模式下，可通过如下命令查询：
 
 ```bash
-vega-process -s
+vega-process
 ```
 
 ### 2.7 如何查询vega程序运行进度
 
 在安全模式下，可通过如下命令查询正在运行的Vega程序运行进度：
 
 ```bash
-vega-progress -t <Task ID> -r <Task Root Path>
+vega-progress -s -t <Task ID> -r <Task Root Path>
 ```
 
 在普通模式下，可通过如下命令查询：
 
 ```bash
-vega-progress -s -t <Task ID> -r <Task Root Path>
+vega-progress -t <Task ID> -r <Task Root Path>
 ```
 
 ### 2.8 如何使用vega程序执行模型推理

diff --git a/docs/en/user/config_reference.md b/docs/en/user/config_reference.md
@@ -156,7 +156,8 @@ fully_train:
         common:
             data_path: /cache/datasets/cifar10/
 ```
-**Note**: HCCL supports multi-machine multi-card, Horovod currently only supports single machine multi-card.
+
+**Note**: When Horovod is used, passwordless SSH login must be configured between all nodes in the cluster, and the Python version and path must be the same on every host.
 
 ## 3. NAS and HPO configuration items
 

diff --git a/docs/en/user/faq.md b/docs/en/user/faq.md
@@ -108,52 +108,52 @@ In safe mode, the Vega application can be terminated using the following command
 
 ```bash
 # Query the process ID of the running Vega main program.
-vega-process
+vega-process -s
 # Terminate a Vega main program and related processes.
-vega-kill -p <pid>
+vega-kill -s -p <pid>
 # Terminate a Vega main program and related processes by task ID.
-vega-kill -t <task id>
+vega-kill -s -t <task id>
 # Or stop all Vega-related processes at once.
-vega-kill -a
+vega-kill -s -a
 # If the main program was shut down abnormally and related processes remain, you can forcibly clear them.
-vega-kill -f
+vega-kill -s -f
 ```
 
 In common mode, run the following commands:
 
 ```bash
-vega-kill -s -l
-vega-kill -s -p <pid>
-vega-kill -s -a
-vega-kill -s -f
+vega-process
+vega-kill -p <pid>
+vega-kill -a
+vega-kill -f
 ```
 
 ### 2.6 How Do I Query the Running Vega Program
 
 In safe mode, run the following command to query the running Vega applications:
 
 ```bash
-vega-process
+vega-process -s
 ```
 
 In common mode, you can run the following command to query:
 
 ```bash
-vega-process -s
+vega-process
 ```
 
 ### 2.7 How Do I Query the Vega Program Running Progress
 
 In safe mode, you can run the following command to query the running progress of the Vega program:
 
 ```bash
-vega-progress -t <Task ID> -r <Task Root Path>
+vega-progress -s -t <Task ID> -r <Task Root Path>
 ```
 
 In common mode, you can run the following command to query:
 
 ```bash
-vega-progress -s -t <Task ID> -r <Task Root Path>
+vega-progress -t <Task ID> -r <Task Root Path>
 ```
 
 ### 2.8 How to Perform Model Inference Using the Vega Program

diff --git a/vega/core/pipeline/horovod/run_horovod_train.sh b/vega/core/pipeline/horovod/run_horovod_train.sh
@@ -17,7 +17,7 @@ IFS="," eval 'server_list="${IP_ARRAY[*]}"'
 run_experiment() {
     local np=$1
     shift
-    horovodrun -np $np -H $server_list $@
+    horovodrun --start-timeout 300 -np $np -H $server_list $@
 }
 
 run_experiment $nps $PYTHON_COMMAND $SCRIPT_PATH --cf_file $2
diff --git a/vega/core/pipeline/horovod_train_step.py b/vega/core/pipeline/horovod_train_step.py
@@ -56,6 +56,7 @@ def do(self):
 
     def _set_cluster_info(self):
         General.cluster.num_workers_per_node = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
+        General.cluster.num_nodes = max(General.cluster.num_nodes, 1 + len(General.cluster.slaves))
         General.cluster.num_workers = General.cluster.num_workers_per_node * General.cluster.num_nodes
 
     def train_model(self, trainer):

diff --git a/vega/report/report_server.py b/vega/report/report_server.py
@@ -129,18 +129,18 @@ def get_pareto_front_records(self, step_name=None, nums=None, selected_key=None,
                 self.old_not_finished_workers = not_finished
                 logging.info(f"waiting for the workers {str(not_finished)} to finish")
         if not records:
-            self.update_step_info({"step_name": step_name, "best_models": []})
+            self.update_step_info(step_name=step_name, best_models=[])
             return []
         pareto = self.pareto_front(step_name, nums, records=records)
         if not pareto:
-            self.update_step_info(**{"step_name": step_name, "best_models": []})
+            self.update_step_info(step_name=step_name, best_models=[])
             return []
         if choice is not None:
-            records = random.choice(pareto)
-            self.update_step_info(**{"step_name": step_name, "best_models": [record.worker_id for record in records]})
-            return [records]
+            record = random.choice(pareto)
+            self.update_step_info(step_name=step_name, best_models=[record.worker_id])
+            return [record]
         else:
-            self.update_step_info(**{"step_name": step_name, "best_models": [record.worker_id for record in pareto]})
+            self.update_step_info(step_name=step_name, best_models=[record.worker_id for record in pareto])
             return pareto
 
     @classmethod