Commit 211c2c2: Merge pull request #67 from ddlBoJack/main

sync

ddlBoJack authored May 7, 2024
2 parents 54c69a9 + e2e5437
Showing 33 changed files with 6,125 additions and 36 deletions.
7 changes: 6 additions & 1 deletion .gitignore
@@ -9,4 +9,9 @@ wandb/
log/
*.log
outputs/
-data/
+data/
+
+.gitignore
+examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_noself.sh
+examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_copy.sh
+examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_copy.sh
2 changes: 1 addition & 1 deletion examples/asr_librispeech/README.md
@@ -5,7 +5,7 @@
We only train the linear projector in this recipe.
Encoder | Projector | LLM | test-clean | test-other
|---|---|---|---|---
[WavLM-large](https://drive.google.com/file/d/12-cB34qCTvByWT-QtOcZaqwwO21FLSqU/view) | [Linear](https://drive.google.com/file/d/1cLNuMR05oXxKj8M_Z3yAZ5JHJ06ybIHp/view?usp=sharing)(~18.88M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 2.28 | 4.78
[hubert_xtralarge_ll60k_finetune_ls960](https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k_finetune_ls960.pt) | [Linear](https://drive.google.com/file/d/1Np7EjMYSZCl7M6Q92pt_MvOSSX6ggJPA/view?usp=drive_link)(~21.50M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 1.84 | 3.39

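Only the projector is trained; the encoder and LLM stay frozen. As a hedged sketch of what such a frame-concatenation linear projector can look like (the class name, hidden width, and exact layout are assumptions, though with encoder_dim 1024/1280, ds_rate 5, a 2048-d hidden layer, and llm_dim 4096 the parameter counts work out to roughly the ~18.88M and ~21.50M reported in the table above):

```python
import torch.nn as nn

class EncoderProjectorConcat(nn.Module):
    """Stack ds_rate consecutive encoder frames, then map them into the
    LLM embedding space through a small two-layer MLP."""
    def __init__(self, encoder_dim=1280, llm_dim=4096, ds_rate=5, hidden=2048):
        super().__init__()
        self.ds_rate = ds_rate
        self.linear1 = nn.Linear(encoder_dim * ds_rate, hidden)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden, llm_dim)

    def forward(self, x):                      # x: (batch, frames, encoder_dim)
        b, t, d = x.shape
        t = t - t % self.ds_rate               # drop ragged tail frames
        x = x[:, :t, :].reshape(b, t // self.ds_rate, d * self.ds_rate)
        return self.linear2(self.relu(self.linear1(x)))  # (batch, t/ds_rate, llm_dim)
```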
## Data preparation
You need to prepare the data as a jsonl file in this format.
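The full example is collapsed in this diff view. As an illustration only (the field names below are an assumption drawn from the style of other SLAM-LLM recipes, not from this diff), each line is one JSON object:

```
{"key": "1001-134707-0000", "source": "/path/to/LibriSpeech/test-clean/1001/134707/1001-134707-0000.flac", "target": "the transcript of the utterance"}
```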
7 changes: 5 additions & 2 deletions examples/asr_librispeech/asr_config.py
@@ -15,7 +15,10 @@ class ModelConfig:
encoder_projector_ds_rate: int = 5
modal: str = "audio"
normalize: Optional[bool] = field(default=False, metadata={
"help": "whether inpit is normalized, used for models such as wavlm"
"help": "whether input is normalized, used for models such as wavlm"
})
+    encoder_type: str = field(default="finetune", metadata={
+        "help": "whether model is only pretrained or finetuned, used for models such as hubert"
+    })

@dataclass
@@ -97,7 +100,7 @@ class DataConfig:
"help": "80 for whisper large v1 and v2, 128 for v3"
})
normalize: Optional[bool] = field(default=False, metadata={
"help": "whether inpit is normalized, used for models such as wavlm"
"help": "whether input is normalized, used for models such as wavlm"
})

@dataclass
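The new encoder_type flag tells the loader whether a HuBERT checkpoint is purely pretrained or already CTC-finetuned, since fairseq wraps the two differently. A hedged sketch of the kind of branching this implies (load_model_ensemble_and_task is fairseq's real loading API; the unwrapping attribute path is an assumption):

```python
from fairseq import checkpoint_utils

models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([encoder_path])
encoder = models[0]
if encoder_type == "finetune":
    # Finetuned checkpoints such as hubert_xtralarge_ll60k_finetune_ls960
    # wrap the transformer in a CTC head; unwrap to reach the base model
    # (attribute path assumed here, check against your fairseq version).
    encoder = encoder.w2v_encoder.w2v_model
encoder.eval()
```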
@@ -0,0 +1,61 @@
#!/bin/bash
#export PYTHONPATH=/root/whisper:$PYTHONPATH
export PYTHONPATH=/root/fairseq:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=0
export TOKENIZERS_PARALLELISM=false
# export CUDA_LAUNCH_BLOCKING=1

run_dir=/root/SLAM-LLM
cd $run_dir
code_dir=examples/asr_librispeech

speech_encoder_path=/nfs/yangguanrou.ygr/ckpts/hubert_ckpt/hubert_xtralarge_ll60k_finetune_ls960.pt
llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5

output_dir=/nfs/yangguanrou.ygr/experiments_hubert/vicuna-7b-v1.5-hubert_xtralarge_ll60k_finetune_ls960
ckpt_path=$output_dir/asr_epoch_1_step_1000
split=librispeech_test_clean
val_data_path=/nfs/maziyang.mzy/data/librispeech/${split}.jsonl
decode_log=$ckpt_path/decode_${split}_beam4

# -m debugpy --listen 5678 --wait-for-client
python $code_dir/inference_asr_batch.py \
--config-path "conf" \
--config-name "prompt.yaml" \
hydra.run.dir=$ckpt_path \
++model_config.llm_name="vicuna-7b-v1.5" \
++model_config.llm_path=$llm_path \
++model_config.llm_dim=4096 \
++model_config.encoder_name=hubert \
++model_config.normalize=true \
++dataset_config.normalize=true \
++model_config.encoder_projector_ds_rate=5 \
++model_config.encoder_path=$speech_encoder_path \
++model_config.encoder_dim=1280 \
++model_config.encoder_type=finetune \
++model_config.encoder_projector=linear \
++dataset_config.dataset=speech_dataset \
++dataset_config.val_data_path=$val_data_path \
++dataset_config.input_type=raw \
++dataset_config.inference_mode=true \
++dataset_config.prompt="Transcribe speech to text. " \
++train_config.model_name=asr \
++train_config.freeze_encoder=true \
++train_config.freeze_llm=true \
++train_config.batching_strategy=custom \
++train_config.num_epochs=1 \
++train_config.val_batch_size=1 \
++train_config.num_workers_dataloader=0 \
++train_config.output_dir=$output_dir \
++decode_log=$decode_log \
++ckpt_path=$ckpt_path/model.pt \
# ++peft_ckpt=$ckpt_path \
# ++train_config.use_peft=true \
# ++train_config.peft_config.r=32 \
# ++dataset_config.normalize=true \
# ++model_config.encoder_projector=q-former \
# ++dataset_config.fix_length_audio=64 \

python src/slam_llm/utils/whisper_tn.py ${decode_log}_gt ${decode_log}_gt.proc
python src/slam_llm/utils/whisper_tn.py ${decode_log}_pred ${decode_log}_pred.proc
python src/slam_llm/utils/compute_wer.py ${decode_log}_gt.proc ${decode_log}_pred.proc ${decode_log}.proc.wer
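The three post-processing calls appear to run both references and hypotheses through a Whisper-style text normalizer before scoring, so casing and punctuation differences are not counted as errors. The internals of compute_wer.py are not shown in this diff; under that caveat, a minimal stand-in using the jiwer package computes the same metric:

```python
import jiwer

# Assumes one normalized utterance per line, with references and hypotheses
# in matching order (the exact .proc file layout is an assumption here).
refs = open("decode_librispeech_test_clean_beam4_gt.proc").read().splitlines()
hyps = open("decode_librispeech_test_clean_beam4_pred.proc").read().splitlines()
print(f"WER: {jiwer.wer(refs, hyps):.4f}")
```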
@@ -0,0 +1,76 @@
#!/bin/bash
# export PYTHONPATH=/root/whisper:$PYTHONPATH
export PYTHONPATH=/root/fairseq:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=2,3
export TOKENIZERS_PARALLELISM=false
# export CUDA_LAUNCH_BLOCKING=1
export OMP_NUM_THREADS=1

# debug setting for multiple gpus
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export TORCH_DISTRIBUTED_DEBUG=INFO

run_dir=/root/SLAM-LLM
cd $run_dir
code_dir=examples/asr_librispeech

speech_encoder_path=/nfs/yangguanrou.ygr/ckpts/hubert_ckpt/hubert_xtralarge_ll60k_finetune_ls960.pt
llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5
train_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl
val_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_dev_other.jsonl

output_dir=/root/tmp/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-hubert-xtralarge-$(date +"%Y%m%d")

hydra_args="
hydra.run.dir=$output_dir \
++model_config.llm_name=vicuna-7b-v1.5 \
++model_config.llm_path=$llm_path \
++model_config.llm_dim=4096 \
++model_config.encoder_name=hubert \
++model_config.normalize=true \
++dataset_config.normalize=true \
++model_config.encoder_projector_ds_rate=5 \
++model_config.encoder_path=$speech_encoder_path \
++model_config.encoder_dim=1280 \
++model_config.encoder_type=finetune \
++model_config.encoder_projector=linear \
++dataset_config.dataset=speech_dataset \
++dataset_config.train_data_path=$train_data_path \
++dataset_config.val_data_path=$val_data_path \
++dataset_config.input_type=raw \
++train_config.model_name=asr \
++train_config.num_epochs=3 \
++train_config.freeze_encoder=true \
++train_config.freeze_llm=true \
++train_config.batching_strategy=custom \
++train_config.warmup_steps=1000 \
++train_config.total_steps=100000 \
++train_config.lr=1e-4 \
++train_config.validation_interval=2000 \
++train_config.batch_size_training=6 \
++train_config.val_batch_size=6 \
++train_config.num_workers_dataloader=0 \
++train_config.output_dir=$output_dir \
++metric=acc \
"

# -m debugpy --listen 5678 --wait-for-client
if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then
    python $code_dir/finetune_asr.py \
--config-path "conf" \
--config-name "prompt.yaml" \
$hydra_args
else
torchrun \
--nnodes 1 \
--nproc_per_node 2 \
--master_port=29503 \
$code_dir/finetune_asr.py \
--config-path "conf" \
--config-name "prompt.yaml" \
++train_config.enable_fsdp=false \
++train_config.enable_ddp=true \
++train_config.use_fp16=true \
$hydra_args
fi
30 changes: 30 additions & 0 deletions examples/vsr_LRS3/README.md
@@ -0,0 +1,30 @@
# VSR_LRS3

## Performance and checkpoints
We only train the linear projector in this recipe.
Encoder | Projector | LLM | test
|---|---|---|---|
[AV-HuBERT Large + Self-Training](https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/vsr/self_large_vox_433h.pt) | [Linear](https://drive.google.com/file/d/1DNfJgyeLx9xet4DT5xZXyx8ZOcNoawL8/view?usp=drive_link)(~15.74M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 29.47


## Data preparation
Follow the steps in [preparation](https://github.com/facebookresearch/av_hubert/tree/main/avhubert/preparation) of av_hubert to pre-process the LRS3 dataset.

## Environment
Use the specific fairseq version of [av_hubert](https://github.com/facebookresearch/av_hubert), which is compatible with hydra-core versions below 1.0.7 and omegaconf versions below 2.0.6.


## Decode with checkpoints
```
bash decode_avhubert_vo_vicuna_7b.sh
```
Modify the paths, including `speech_encoder_path`, `llm_path`, `output_dir`, `ckpt_path`, and `decode_log`, in the script before you run it.

## Train a new model

### Use the visual part of AV-HuBERT Large as the encoder
```
bash finetune_avhubert_vo_vicuna_7b.sh
```
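The visual-only ("vo") setup feeds AV-HuBERT the lip-region video stream and no audio. A hedged sketch of visual feature extraction via av_hubert's extract_finetune interface (a finetuned checkpoint may additionally wrap the base model, and the exact signature should be checked against your av_hubert checkout):

```python
import torch
from fairseq import checkpoint_utils

models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(["self_large_vox_433h.pt"])
avhubert = models[0]  # unwrapping to the base AVHubert module is elided here

# Dummy lip-region clip: (batch, channel, frames, height, width), 88x88 grayscale.
video = torch.zeros(1, 1, 100, 88, 88)
features, padding_mask = avhubert.extract_finetune(
    source={"video": video, "audio": None},  # visual-only: no audio stream
    padding_mask=None,
)
```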


19 changes: 19 additions & 0 deletions examples/vsr_LRS3/conf/ds_config.json
@@ -0,0 +1,19 @@
{
"train_micro_batch_size_per_gpu": 4,
"gradient_accumulation_steps": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 1e-4
}
},
"fp16": {
"enabled": true
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu"
}
}
}
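This config enables fp16 and ZeRO stage 3 with optimizer state offloaded to CPU, trading GPU memory for host-device traffic. How SLAM-LLM wires the file in is not shown in this diff; a generic sketch using DeepSpeed's real initialize API, with a stand-in model:

```python
import deepspeed
import torch.nn as nn

model = nn.Linear(1024, 1024)  # stand-in for the real model
engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="examples/vsr_LRS3/conf/ds_config.json",
)
# engine.backward(loss) and engine.step() then replace the usual
# loss.backward() / optimizer.step() calls in the training loop.
```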
3 changes: 3 additions & 0 deletions examples/vsr_LRS3/conf/prompt.yaml
@@ -0,0 +1,3 @@
dataset_config:
# we put the prompt here because the hydra override in the shell script only supports a small subset of characters
prompt: "Transcribe the silent speech in this video to text by lip-reading the speaker's clear and visible lip movements."
45 changes: 45 additions & 0 deletions examples/vsr_LRS3/finetune_vsr.py
@@ -0,0 +1,45 @@
from slam_llm.pipeline.finetune import main as train

import hydra
import logging
from dataclasses import dataclass, field
from omegaconf import DictConfig, ListConfig, OmegaConf
from vsr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig

@dataclass
class RunConfig:
dataset_config: DataConfig = field(default_factory=DataConfig)
model_config: ModelConfig = field(default_factory=ModelConfig)
train_config: TrainConfig = field(default_factory=TrainConfig)
log_config: LogConfig = field(default_factory=LogConfig)
fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})

@hydra.main(config_name=None)
def main_hydra(cfg: DictConfig):
run_config = RunConfig()
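    # Merge order matters: values coming from the CLI/YAML (cfg) override
    # the RunConfig dataclass defaults.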
cfg = OmegaConf.merge(run_config, cfg)
def to_plain_list(cfg_item):
if isinstance(cfg_item, ListConfig):
return OmegaConf.to_container(cfg_item, resolve=True)
elif isinstance(cfg_item, DictConfig):
return {k: to_plain_list(v) for k, v in cfg_item.items()}
else:
return cfg_item

# kwargs = to_plain_list(cfg)
kwargs = cfg
log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())

logging.basicConfig(level=log_level)

if kwargs.get("debug", False):
        import pdb
pdb.set_trace()

train(kwargs)


if __name__ == "__main__":
main_hydra()
53 changes: 53 additions & 0 deletions examples/vsr_LRS3/inference_vsr_batch.py
@@ -0,0 +1,53 @@
from slam_llm.pipeline.inference_batch import main as inference

import hydra
import logging
from dataclasses import dataclass, field
from omegaconf import DictConfig, ListConfig, OmegaConf
from typing import Optional
from vsr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig


@dataclass
class RunConfig:
dataset_config: DataConfig = field(default_factory=DataConfig)
model_config: ModelConfig = field(default_factory=ModelConfig)
train_config: TrainConfig = field(default_factory=TrainConfig)
log_config: LogConfig = field(default_factory=LogConfig)
fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})
decode_log: str = field(
default="output/decode_log",
metadata={"help": "The prefix for the decode output"},
)
ckpt_path: str = field(
default="output/model.pt", metadata={"help": "The path to projector checkpoint"}
)
peft_ckpt: Optional[str] = field(
default=None,
metadata={
"help": "The path to peft checkpoint, should be a directory including adapter_config.json"
},
)


@hydra.main(config_name=None)
def main_hydra(cfg: DictConfig):
run_config = RunConfig()
cfg = OmegaConf.merge(run_config, cfg)
# kwargs = to_plain_list(cfg)
log_level = getattr(logging, cfg.get("log_level", "INFO").upper())

logging.basicConfig(level=log_level)

if cfg.get("debug", False):
import pdb

pdb.set_trace()

inference(cfg)


if __name__ == "__main__":
main_hydra()