update musiccaps and some scripts

X-LANCE · May 20, 2024 · f3e6b57 · f3e6b57
1 parent 3b79fb8
commit f3e6b57
Show file tree

Hide file tree

Showing 22 changed files with 127 additions and 209 deletions.
diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml
@@ -6,7 +6,7 @@ body:
     attributes:
       value: >
         #### Before submitting a bug, please make sure the issue hasn't already been addressed by searching through [the
-        existing and past issues](https://github.com/facebookresearch/llama-recipes/issues), the [FAQ](https://github.com/facebookresearch/llama-recipes/blob/main/docs/FAQ.md) 
+        existing and past issues](https://github.com/ddlBoJack/SLAM-LLM/issues).
 
   - type: textarea
     id: system-info

diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml
@@ -1,5 +1,5 @@
 name: 🚀 Feature request
-description: Submit a proposal/request for a new llama-recipes feature
+description: Submit a proposal/request for a new slam-llm feature
 
 body:
 - type: textarea

diff --git a/.gitignore b/.gitignore
@@ -9,9 +9,4 @@ wandb/
 log/
 *.log
 outputs/
-data/
-
-.gitignore
-examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_noself.sh
-examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_copy.sh
-examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_copy.sh
+data/
diff --git a/README.md b/README.md
@@ -27,6 +27,7 @@ developers to train custom multimodal large language model (MLLM), focusing on <
 5. [Acknowledge](#acknowledge)
 
 # News
+- [Update May. 20, 2024] Recipes for [music caption (MC)](examples/mc_musiccaps/README.md) have been supported. 
 - [Update May. 8, 2024] Recipes for [visual speech recognition (VSR)](examples/vsr_LRS3/README.md) have been supported. 
 - [Update May. 4, 2024] Recipes for [zero-shot text-to-speech (TTS)](examples/vallex/README.md) have been supported. 
 - [Update Apr. 28, 2024] Recipes for [automated audio captioning (AAC)](examples/aac_audiocaps/README.md) have been supported. 
@@ -66,6 +67,8 @@ We provide reference implementations of various LLM-based speech, audio, and mus
     - [Visual Speech Recognition (VSR)](examples/vsr_LRS3/README.md)
 - **Audio Task**
     - [Automated Audio Captioning (AAC)](examples/aac_audiocaps/README.md)
+- **Music Task**
+    - [Music Caption (MC)](examples/mc_musiccaps/README.md)
 
 ## Configuration Priority
 We provide hierarchical configuration inheritance relationships as follows:

diff --git a/examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b.sh b/examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b.sh
@@ -55,7 +55,3 @@ python $code_dir/inference_asr_batch.py \
         # ++dataset_config.normalize=true \
         # ++model_config.encoder_projector=q-former \
         # ++dataset_config.fix_length_audio=64 \
-
-python src/slam_llm/utils/whisper_tn.py ${decode_log}_gt ${decode_log}_gt.proc
-python src/slam_llm/utils/whisper_tn.py ${decode_log}_pred ${decode_log}_pred.proc
-python src/slam_llm/utils/compute_wer.py ${decode_log}_gt.proc ${decode_log}_pred.proc ${decode_log}.proc.wer
diff --git a/examples/asr_librispeech/scripts/decode_wavlm_large_linear_vicuna_7b.sh b/examples/asr_librispeech/scripts/decode_wavlm_large_linear_vicuna_7b.sh
@@ -52,7 +52,3 @@ python $code_dir/inference_asr_batch.py \
         # ++dataset_config.normalize=true \
         # ++model_config.encoder_projector=q-former \
         # ++dataset_config.fix_length_audio=64 \
-
-python src/slam_llm/utils/whisper_tn.py ${decode_log}_gt ${decode_log}_gt.proc
-python src/slam_llm/utils/whisper_tn.py ${decode_log}_pred ${decode_log}_pred.proc
-python src/slam_llm/utils/compute_wer.py ${decode_log}_gt.proc ${decode_log}_pred.proc ${decode_log}.proc.wer
diff --git a/examples/asr_librispeech/scripts/decode_whisper_large_linear_vicuna_7b.sh b/examples/asr_librispeech/scripts/decode_whisper_large_linear_vicuna_7b.sh
@@ -51,7 +51,3 @@ python $code_dir/inference_asr_batch.py \
         # ++dataset_config.normalize=true \
         # ++model_config.encoder_projector=q-former \
         # ++dataset_config.fix_length_audio=64 \
-
-python src/slam_llm/utils/whisper_tn.py ${decode_log}_gt ${decode_log}_gt.proc
-python src/slam_llm/utils/whisper_tn.py ${decode_log}_pred ${decode_log}_pred.proc
-python src/slam_llm/utils/compute_wer.py ${decode_log}_gt.proc ${decode_log}_pred.proc ${decode_log}.proc.wer
diff --git a/examples/music_caption/README.md → examples/mc_musiccaps/README.md b/examples/music_caption/README.md → examples/mc_musiccaps/README.md
@@ -1,9 +1,9 @@
-# Music Caption
+# MC_MusicCaps
 
 ## Performance and checkpoints
 Here is a recipe for music captioning, using MusicFM as the encoder. We only train the linear projector. For more about MusicFM and its checkpoints, please refer to [this repository](https://github.com/minzwon/musicfm).
 
-The following results are obtained by training on the LP-MusicCaps-MC training set and evaluating on the LP-MusicCaps-MC test set.
+The following results are obtained by training on the [LP-MusicCaps-MC](https://huggingface.co/datasets/seungheondoh/LP-MusicCaps-MC) training set and evaluating on the [LP-MusicCaps-MC](https://huggingface.co/datasets/seungheondoh/LP-MusicCaps-MC) test set.
 Encoder | Projector | LLM | BLEU-1 | METEOR | SPICE | SPIDER 
 |---|---|---|---|---|---|---
 [MusicFM(pretrained with MSD)](https://huggingface.co/minzwon/MusicFM/resolve/main/pretrained_msd.pt) | [Linear](https://drive.google.com/file/d/1-9pob6QvJRoq5Dy-LZbiDfF6Q7QRO8Au/view?usp=sharing)(~18.88M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 25.6 | 10.0 | 8.7 | 6.9

diff --git a/examples/music_caption/conf/ds_config.json → examples/mc_musiccaps/conf/ds_config.json b/examples/music_caption/conf/ds_config.json → examples/mc_musiccaps/conf/ds_config.json
diff --git a/examples/music_caption/conf/prompt.yaml → examples/mc_musiccaps/conf/prompt.yaml b/examples/music_caption/conf/prompt.yaml → examples/mc_musiccaps/conf/prompt.yaml
diff --git a/...s/music_caption/deepspeed_finetune_mir.py → ...es/mc_musiccaps/deepspeed_finetune_mir.py b/...s/music_caption/deepspeed_finetune_mir.py → ...es/mc_musiccaps/deepspeed_finetune_mir.py
diff --git a/examples/music_caption/finetune_mir.py → examples/mc_musiccaps/finetune_mir.py b/examples/music_caption/finetune_mir.py → examples/mc_musiccaps/finetune_mir.py
diff --git a/...ples/music_caption/inference_mir_batch.py → examples/mc_musiccaps/inference_mir_batch.py b/...ples/music_caption/inference_mir_batch.py → examples/mc_musiccaps/inference_mir_batch.py
diff --git a/examples/music_caption/mir_config.py → examples/mc_musiccaps/mir_config.py b/examples/music_caption/mir_config.py → examples/mc_musiccaps/mir_config.py
diff --git a/examples/mc_musiccaps/model/slam_model_mir.py b/examples/mc_musiccaps/model/slam_model_mir.py
@@ -0,0 +1,78 @@
+import torch
+import os
+import logging
+from slam_llm.models.slam_model import (
+    slam_model,
+    setup_tokenizer,
+    setup_encoder,
+    setup_encoder_projector,
+    setup_llm,
+)
+from slam_llm.utils.train_utils import print_model_size
+
+logger = logging.getLogger(__name__)
+
+def model_factory(train_config, model_config, **kwargs):
+    # return necessary components for training
+    tokenizer = setup_tokenizer(train_config, model_config, **kwargs)
+
+    encoder = setup_encoder(train_config, model_config, **kwargs)
+
+    # llm
+    llm = setup_llm(train_config, model_config, **kwargs)
+
+    # projector
+    encoder_projector = setup_encoder_projector(
+        train_config, model_config, **kwargs
+    )
+    model = slam_model_mir(
+        encoder,
+        llm,
+        encoder_projector,
+        tokenizer,
+        train_config,
+        model_config,
+        **kwargs,
+    )
+
+    ckpt_path = kwargs.get(
+        "ckpt_path", None
+    )  # FIX(MZY): load model ckpt(mainly projector, related to model_checkpointing/checkpoint_handler.py: save_model_checkpoint_peft)
+
+    if ckpt_path is not None:
+        logger.info("loading other parts from: {}".format(ckpt_path))
+        ckpt_dict = torch.load(ckpt_path, map_location="cpu")
+        model.load_state_dict(ckpt_dict, strict=False)
+
+    print_model_size(
+        model,
+        train_config,
+        (
+            int(os.environ["RANK"])
+            if train_config.enable_fsdp or train_config.enable_ddp
+            else 0
+        ),
+    )
+    return model, tokenizer
+
+
+class slam_model_mir(slam_model):
+    def __init__(
+        self,
+        encoder,
+        llm,
+        encoder_projector,
+        tokenizer,
+        train_config,
+        model_config,
+        **kwargs,
+    ):
+        super().__init__(
+            encoder,
+            llm,
+            encoder_projector,
+            tokenizer,
+            train_config,
+            model_config,
+            **kwargs,
+        )
diff --git a/...ts/decode_musicfm_linear_vicuna_7b_10s.sh → ...ts/decode_musicfm_linear_vicuna_7b_10s.sh b/...ts/decode_musicfm_linear_vicuna_7b_10s.sh → ...ts/decode_musicfm_linear_vicuna_7b_10s.sh
diff --git a/.../finetune_musicfm_linear_vicuna_7b_10s.sh → .../finetune_musicfm_linear_vicuna_7b_10s.sh b/.../finetune_musicfm_linear_vicuna_7b_10s.sh → .../finetune_musicfm_linear_vicuna_7b_10s.sh
diff --git a/examples/music_caption/model/slam_model_mir.py b/examples/music_caption/model/slam_model_mir.py
diff --git a/examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b.sh b/examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b.sh
@@ -49,7 +49,3 @@ python $code_dir/inference_vsr_batch.py \
         # +dataset_config.normalize=true \
         # +model_config.encoder_projector=q-former \
         # +dataset_config.fix_length_audio=64 \
-
-python src/slam_llm/utils/whisper_tn.py ${decode_log}_gt ${decode_log}_gt.proc
-python src/slam_llm/utils/whisper_tn.py ${decode_log}_pred ${decode_log}_pred.proc
-python src/slam_llm/utils/compute_wer.py ${decode_log}_gt.proc ${decode_log}_pred.proc ${decode_log}.proc.wer
diff --git a/pyproject.toml b/pyproject.toml
@@ -5,12 +5,7 @@ build-backend = "hatchling.build"
 [project]
 name = "slam-llm"
 version = "0.0.1"
-authors = [
-  { name="Hamid Shojanazeri", email="[email protected]" },
-  { name="Matthias Reso", email="[email protected]" },
-  { name="Geeta Chauhan", email="[email protected]" },
-]
-description = "To be done"
+description = "SLAM-LLM is a deep learning toolkit that allows researchers and developers to train custom multimodal large language models (MLLMs), focusing on Speech, Language, Audio, and Music processing. We provide detailed recipes for training and high-performance checkpoints for inference."
 readme = "README.md"
 requires-python = ">=3.8"
 classifiers = [
@@ -26,8 +21,8 @@ tests = ["pytest-mock"]
 auditnlg = ["auditnlg"]
 
 [project.urls]
-"Homepage" = "https://github.com/facebookresearch/llama-recipes/"
-"Bug Tracker" = "https://github.com/facebookresearch/llama-recipes/issues"
+"Homepage" = "https://github.com/ddlBoJack/SLAM-LLM"
+"Bug Tracker" = "https://github.com/ddlBoJack/SLAM-LLM/issues"
 
 [tool.hatch.build]
 exclude = [

diff --git a/scripts/compute_wer.sh b/scripts/compute_wer.sh