From 51aa5e346b077a54f91a3a9cd4b0fc99acd29f1d Mon Sep 17 00:00:00 2001
From: xumingw
Date: Thu, 27 Jun 2024 21:09:32 +0800
Subject: [PATCH] docs: add data preprocessing and training guide

---
 README.md              | 99 ++++++++++++++++++++++++++++++++++++++++++
 accelerate_config.yaml | 21 +++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 accelerate_config.yaml

diff --git a/README.md b/README.md
index b0cd3ce8..85cdc063 100644
--- a/README.md
+++ b/README.md
@@ -234,6 +234,105 @@ options:
                         face region
 ```
 
+## Training
+
+### Prepare Data for Training
+
+The training data consists of talking-face videos similar to the source images used for inference, and needs to meet the following requirements:
+
+1. Each video should be cropped into a square.
+2. The face should be the main focus, making up 50%-70% of the image.
+3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
+
+Organize your raw videos into the following directory structure:
+
+```text
+dataset_name/
+|-- videos/
+|   |-- 0001.mp4
+|   |-- 0002.mp4
+|   |-- 0003.mp4
+|   `-- 0004.mp4
+```
+
+You can use any `dataset_name`, but ensure the `videos` directory is named as shown above.
+
+Next, process the videos with the following commands:
+
+```bash
+python -m scripts.data_preprocess --input_dir dataset_name/videos --step 1
+python -m scripts.data_preprocess --input_dir dataset_name/videos --step 2
+```
+
+**Note:** Execute steps 1 and 2 sequentially, as they perform different tasks. Step 1 converts videos into frames, extracts audio from each video, and generates the necessary masks. Step 2 generates face embeddings using InsightFace and audio embeddings using Wav2Vec, and requires a GPU. For parallel processing, use the `-p` and `-r` arguments: `-p` specifies the total number of instances to launch, dividing the data into `p` parts, and `-r` specifies which part the current process should handle. You need to manually launch multiple instances with different values for `-r`, as shown in the sketch below.
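+
+For instance, a minimal sketch of launching four parallel instances of step 2 (the `-p`/`-r` flags come from the preprocessing script above; backgrounding the jobs and pinning each to a GPU via `CUDA_VISIBLE_DEVICES` is one possible arrangement, assuming at least four GPUs):
+
+```bash
+#!/bin/bash
+# Split the videos into 4 parts and process one part per GPU.
+for r in 0 1 2 3; do
+  CUDA_VISIBLE_DEVICES=$r python -m scripts.data_preprocess \
+    --input_dir dataset_name/videos --step 2 -p 4 -r $r &
+done
+wait  # block until all 4 instances have finished
+```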
+
+Generate the metadata JSON files with the following commands:
+
+```bash
+python scripts/extract_meta_info_stage1.py -r path/to/dataset -n dataset_name
+python scripts/extract_meta_info_stage2.py -r path/to/dataset -n dataset_name
+```
+
+Replace `path/to/dataset` with the path to the parent directory of `videos`, such as `dataset_name` in the example above. The commands generate `dataset_name_stage1.json` and `dataset_name_stage2.json` in the `./data` directory.
+
+### Training
+
+Update the data meta path settings in the configuration YAML files, `configs/train/stage1.yaml` and `configs/train/stage2.yaml`:
+
+```yaml
+# stage1.yaml
+data:
+  meta_paths:
+    - ./data/dataset_name_stage1.json
+
+# stage2.yaml
+data:
+  meta_paths:
+    - ./data/dataset_name_stage2.json
+```
+
+Start training with the following command:
+
+```shell
+accelerate launch -m \
+  --config_file accelerate_config.yaml \
+  --machine_rank 0 \
+  --main_process_ip 0.0.0.0 \
+  --main_process_port 20055 \
+  --num_machines 1 \
+  --num_processes 8 \
+  scripts.train_stage1 --config ./configs/train/stage1.yaml
+```
+
+#### Accelerate Usage Explanation
+
+The `accelerate launch` command starts the training process with distributed settings:
+
+```shell
+accelerate launch [arguments] {training_script} --{training_script-argument-1} --{training_script-argument-2} ...
+```
+
+**Arguments for Accelerate:**
+
+- `-m, --module`: Interpret the launch script as a Python module.
+- `--config_file`: Configuration file for Hugging Face Accelerate.
+- `--machine_rank`: Rank of the current machine in a multi-node setup.
+- `--main_process_ip`: IP address of the master node.
+- `--main_process_port`: Port of the master node.
+- `--num_machines`: Total number of nodes participating in the training.
+- `--num_processes`: Total number of processes for training, matching the total number of GPUs across all machines.
+
+**Arguments for Training:**
+
+- `{training_script}`: The training script, such as `scripts.train_stage1` or `scripts.train_stage2`.
+- `--{training_script-argument-1}`: Arguments specific to the training script. Our training scripts accept one argument, `--config`, to specify the training configuration file.
+
+For multi-node training, run the same command manually on each node, changing only `--machine_rank`; a two-node sketch follows the configuration file below.
+
+For more settings, refer to the [Accelerate documentation](https://huggingface.co/docs/accelerate/en/index).
+
 ## 📅️ Roadmap
 
 | Status | Milestone | ETA |

diff --git a/accelerate_config.yaml b/accelerate_config.yaml
new file mode 100644
index 00000000..6fa766f1
--- /dev/null
+++ b/accelerate_config.yaml
@@ -0,0 +1,21 @@
+compute_environment: LOCAL_MACHINE
+debug: true
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  gradient_accumulation_steps: 1
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: "no"
+main_training_function: main
+mixed_precision: "fp16"
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
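
As referenced in the Accelerate section above, here is a minimal sketch of a two-node launch. It assumes two machines with 8 GPUs each; the master-node IP `192.168.1.10` is a placeholder, and the doubled `--num_machines`/`--num_processes` values are illustrative. Flags passed to `accelerate launch` generally take precedence over `accelerate_config.yaml`, but keeping `num_machines` and `num_processes` in the file consistent avoids confusion.

```shell
# On node 0 (the master node; replace the placeholder IP with its real address):
accelerate launch -m \
  --config_file accelerate_config.yaml \
  --machine_rank 0 \
  --main_process_ip 192.168.1.10 \
  --main_process_port 20055 \
  --num_machines 2 \
  --num_processes 16 \
  scripts.train_stage1 --config ./configs/train/stage1.yaml

# On node 1, the command is identical except for the machine rank:
accelerate launch -m \
  --config_file accelerate_config.yaml \
  --machine_rank 1 \
  --main_process_ip 192.168.1.10 \
  --main_process_port 20055 \
  --num_machines 2 \
  --num_processes 16 \
  scripts.train_stage1 --config ./configs/train/stage1.yaml
```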