From 51aa5e346b077a54f91a3a9cd4b0fc99acd29f1d Mon Sep 17 00:00:00 2001
From: xumingw
Date: Thu, 27 Jun 2024 21:09:32 +0800
Subject: [PATCH] docs: add data preprocessing and training guide

---
 README.md              | 99 ++++++++++++++++++++++++++++++++++++++++++
 accelerate_config.yaml | 21 +++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 accelerate_config.yaml

diff --git a/README.md b/README.md
index b0cd3ce8..85cdc063 100644
--- a/README.md
+++ b/README.md
@@ -234,6 +234,105 @@ options:
                         face region
 ```
 
+## Training
+
+### Prepare Data for Training
+
+The training data consists of talking-face videos similar to the source images used for inference, and needs to meet the following requirements:
+
+1. Each video should be cropped into a square.
+2. The face should be the main focus, making up 50%-70% of the image.
+3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
+
+Organize your raw videos into the following directory structure:
+
+```text
+dataset_name/
+|-- videos/
+|   |-- 0001.mp4
+|   |-- 0002.mp4
+|   |-- 0003.mp4
+|   `-- 0004.mp4
+```
+
+You can use any `dataset_name`, but ensure the `videos` directory is named as shown above.
+
+Next, process the videos with the following commands:
+
+```bash
+python -m scripts.data_preprocess --input_dir dataset_name/videos --step 1
+python -m scripts.data_preprocess --input_dir dataset_name/videos --step 2
+```
+
+**Note:** Execute steps 1 and 2 sequentially, as they perform different tasks. Step 1 converts videos into frames, extracts audio from each video, and generates the necessary masks. Step 2 generates face embeddings using InsightFace and audio embeddings using Wav2Vec, and requires a GPU. For parallel processing, use the `-p` and `-r` arguments: `-p` specifies the total number of instances to launch, dividing the data into `p` parts, and `-r` specifies which part the current process should handle. You need to manually launch multiple instances with different values for `-r`, as shown in the sketch below.
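+
+For instance, a minimal sketch of launching four parallel instances of step 2 (the `-p`/`-r` flags come from the preprocessing script above; backgrounding the jobs and pinning each to a GPU via `CUDA_VISIBLE_DEVICES` is one possible arrangement, assuming at least four GPUs):
+
+```bash
+#!/bin/bash
+# Split the videos into 4 parts and process one part per GPU.
+for r in 0 1 2 3; do
+  CUDA_VISIBLE_DEVICES=$r python -m scripts.data_preprocess \
+    --input_dir dataset_name/videos --step 2 -p 4 -r $r &
+done
+wait  # block until all 4 instances have finished
+```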
+
+Generate the metadata JSON files with the following commands:
+
+```bash
+python scripts/extract_meta_info_stage1.py -r path/to/dataset -n dataset_name
+python scripts/extract_meta_info_stage2.py -r path/to/dataset -n dataset_name
+```
+
+Replace `path/to/dataset` with the path to the parent directory of `videos`, such as `dataset_name` in the example above. The commands generate `dataset_name_stage1.json` and `dataset_name_stage2.json` in the `./data` directory.
+
+### Training
+
+Update the data meta path settings in the configuration YAML files, `configs/train/stage1.yaml` and `configs/train/stage2.yaml`:
+
+```yaml
+# stage1.yaml
+data:
+  meta_paths:
+    - ./data/dataset_name_stage1.json
+
+# stage2.yaml
+data:
+  meta_paths:
+    - ./data/dataset_name_stage2.json
+```
+
+Start training with the following command:
+
+```shell
+accelerate launch -m \
+  --config_file accelerate_config.yaml \
+  --machine_rank 0 \
+  --main_process_ip 0.0.0.0 \
+  --main_process_port 20055 \
+  --num_machines 1 \
+  --num_processes 8 \
+  scripts.train_stage1 --config ./configs/train/stage1.yaml
+```
+
+#### Accelerate Usage Explanation
+
+The `accelerate launch` command starts the training process with distributed settings:
+
+```shell
+accelerate launch [arguments] {training_script} --{training_script-argument-1} --{training_script-argument-2} ...
+```
+
+**Arguments for Accelerate:**
+
+- `-m, --module`: Interpret the launch script as a Python module.
+- `--config_file`: Configuration file for Hugging Face Accelerate.
+- `--machine_rank`: Rank of the current machine in a multi-node setup.
+- `--main_process_ip`: IP address of the master node.
+- `--main_process_port`: Port of the master node.
+- `--num_machines`: Total number of nodes participating in the training.
+- `--num_processes`: Total number of processes for training, matching the total number of GPUs across all machines.
+
+**Arguments for Training:**
+
+- `{training_script}`: The training script, such as `scripts.train_stage1` or `scripts.train_stage2`.
+- `--{training_script-argument-1}`: Arguments specific to the training script. Our training scripts accept one argument, `--config`, to specify the training configuration file.
+
+For multi-node training, run the same command manually on each node, changing only `--machine_rank`; a two-node sketch follows the configuration file below.
+
+For more settings, refer to the [Accelerate documentation](https://huggingface.co/docs/accelerate/en/index).
+
 ## 📅️ Roadmap
 
 | Status | Milestone | ETA |

diff --git a/accelerate_config.yaml b/accelerate_config.yaml
new file mode 100644
index 00000000..6fa766f1
--- /dev/null
+++ b/accelerate_config.yaml
@@ -0,0 +1,21 @@
+compute_environment: LOCAL_MACHINE
+debug: true
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  gradient_accumulation_steps: 1
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: "no"
+main_training_function: main
+mixed_precision: "fp16"
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
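
As referenced in the Accelerate section above, here is a minimal sketch of a two-node launch. It assumes two machines with 8 GPUs each; the master-node IP `192.168.1.10` is a placeholder, and the doubled `--num_machines`/`--num_processes` values are illustrative. Flags passed to `accelerate launch` generally take precedence over `accelerate_config.yaml`, but keeping `num_machines` and `num_processes` in the file consistent avoids confusion.

```shell
# On node 0 (the master node; replace the placeholder IP with its real address):
accelerate launch -m \
  --config_file accelerate_config.yaml \
  --machine_rank 0 \
  --main_process_ip 192.168.1.10 \
  --main_process_port 20055 \
  --num_machines 2 \
  --num_processes 16 \
  scripts.train_stage1 --config ./configs/train/stage1.yaml

# On node 1, the command is identical except for the machine rank:
accelerate launch -m \
  --config_file accelerate_config.yaml \
  --machine_rank 1 \
  --main_process_ip 192.168.1.10 \
  --main_process_port 20055 \
  --num_machines 2 \
  --num_processes 16 \
  scripts.train_stage1 --config ./configs/train/stage1.yaml
```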