Merge pull request #42 from alexandrainst/chore/config-structure
Chore/config structure
saattrupdan authored Oct 26, 2023
2 parents 12b6829 + ed6f96a commit 272d401
Showing 28 changed files with 1,716 additions and 1,778 deletions.
7 changes: 2 additions & 5 deletions .github/workflows/ci.yaml
@@ -17,7 +17,7 @@ jobs:
    strategy:
      matrix:
        os: [windows-latest, macos-latest, ubuntu-latest]
-        python-version: ["3.10", "3.11"]
+        python-version: ["3.11"]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
@@ -26,7 +26,7 @@ jobs:
        uses: FedericoCarboni/setup-ffmpeg@v2

      - name: Install Poetry
-        run: pipx install poetry==1.4.0
+        run: pip3 install poetry==1.5.1

      - name: Set up Python
        uses: actions/setup-python@v4
@@ -39,9 +39,6 @@ jobs:
          poetry env use "${{ matrix.python-version }}"
          poetry install --no-interaction --no-cache
-      - name: Fix PyTorch bug
-        run: poetry add torch==2.0.0
-
      - name: Test with pytest
        run: poetry run pytest
        env:
13 changes: 8 additions & 5 deletions .github/workflows/docs.yaml
@@ -17,8 +17,11 @@ jobs:
    steps:
      - uses: actions/checkout@v3

+      - name: Install ffmpeg
+        uses: FedericoCarboni/setup-ffmpeg@v2
+
      - name: Install Poetry
-        run: pipx install poetry==1.4.0
+        run: pip3 install poetry==1.5.1

      - name: Set up Python
        uses: actions/setup-python@v4
@@ -27,12 +30,12 @@
        cache: "poetry"

      - name: Install Dependencies
-        run: poetry install
+        run: |
+          poetry env use "${{ matrix.python-version }}"
+          poetry install --no-interaction --no-cache
      - name: Build documentation
-        run: |
-          poetry env use "3.11"
-          poetry run pdoc --docformat google src/coral_models -o docs
+        run: poetry run pdoc --docformat google src/coral_models -o docs

      - name: Compress documentation
        run: tar --directory docs/ -hcf artifact.tar .
6 changes: 6 additions & 0 deletions .gitignore
@@ -89,6 +89,12 @@ target/
# pytest cache
.pytest_cache/

+# Linting cache
+.ruff_cache/
+
+# Python cache
+**/__pycache__
+
# Hydra logs
outputs/
multirun/
55 changes: 39 additions & 16 deletions README.md
@@ -6,7 +6,7 @@ ______________________________________________________________________
[![Documentation](https://img.shields.io/badge/docs-passing-green)](https://alexandrainst.github.io/coral_models/coral_models.html)
[![License](https://img.shields.io/github/license/alexandrainst/coral_models)](https://github.com/alexandrainst/coral_models/blob/main/LICENSE)
[![LastCommit](https://img.shields.io/github/last-commit/alexandrainst/coral_models)](https://github.com/alexandrainst/coral_models/commits/main)
-[![Code Coverage](https://img.shields.io/badge/Coverage-60%25-yellow.svg)](https://github.com/alexandrainst/coral_models/tree/main/tests)
+[![Code Coverage](https://img.shields.io/badge/Coverage-53%25-orange.svg)](https://github.com/alexandrainst/coral_models/tree/main/tests)


Developers:
@@ -54,7 +54,6 @@ publishing the code as a package and more.
## Project structure
```
.
├── .flake8
├── .github
│   └── workflows
│   ├── ci.yaml
@@ -66,22 +65,35 @@ publishing the code as a package and more.
├── config
│   ├── __init__.py
│   ├── config.yaml
│   ├── dataset
│   │   ├── common_voice_da.yaml
│   ├── datasets
│   │   ├── alvenir_test_set.yaml
│   │   ├── common_voice_13_da.yaml
│   │   ├── common_voice_13_nn.yaml
│   │   ├── common_voice_13_sv.yaml
│   │   ├── common_voice_9_da.yaml
│   │   ├── fleurs_da.yaml
│   │   ├── fleurs_nb.yaml
│   │   ├── fleurs_sv.yaml
│   │   ├── ftspeech.yaml
│   │   └── test.yaml
│   │   ├── nota.yaml
│   │   ├── nst_da.yaml
│   │   └── test_dataset.yaml
│   ├── hydra
│   │   └── job_logging
│   │   └── custom.yaml
│   └── model
│   ├── test.yaml
│   ├── test_wav2vec2.yaml
│   ├── test_whisper.yaml
│   ├── wav2vec2.yaml
│   ├── wav2vec2_with_lm.yaml
│   └── whisper.yaml
├── data
│   ├── whisper_large.yaml
│   ├── whisper_medium.yaml
│   ├── whisper_small.yaml
│   ├── whisper_xsmall.yaml
│   └── whisper_xxsmall.yaml
├── docs
│   └── .gitkeep
├── makefile
├── models
├── notebooks
├── poetry.lock
├── poetry.toml
├── pyproject.toml
├── src
@@ -90,15 +102,25 @@ publishing the code as a package and more.
│   │   ├── compute_metrics.py
│   │   ├── data.py
│   │   ├── finetune.py
│   │   ├── model_setup.py
│   │   ├── plot.py
│   │   ├── prepare_raw_data.py
│   │   ├── protocols.py
│   │   ├── utils.py
│   │   └── wav2vec2.py
│   │   ├── wav2vec2.py
│   │   └── whisper.py
│   └── scripts
│   ├── build_coral_data.py
│   ├── build_ftspeech.py
│   ├── evaluate.py
│   ├── finetune.py
│   ├── build_nota.py
│   ├── build_nst_da.py
│   ├── download_ftspeech.py
│   ├── evaluate_model.py
│   ├── find_faulty_audio_clips.py
│   ├── finetune_model.py
│   ├── fix_dot_env_file.py
│   ├── push_ftspeech_to_hub.py
│   ├── plot_training_trajectory.py
│   ├── push_to_hub.py
│   ├── train_ngram_decoder.py
│   └── versioning.py
└── tests
@@ -109,5 +131,6 @@ publishing the code as a package and more.
├── test_finetune.py
├── test_protocols.py
├── test_utils.py
└── test_wav2vec2.py
├── test_wav2vec2.py
└── test_whisper.py
```
24 changes: 19 additions & 5 deletions config/config.yaml
@@ -1,8 +1,7 @@
defaults:
  - model: wav2vec2
  - datasets:
-      - nst_da
-      - common_voice_9_da
+      - common_voice_13_da
  - override hydra/job_logging: custom
  - _self_

@@ -16,7 +15,13 @@ dirs:
seed: 4242

# Dataset parameters
-dataset_probabilities: # null = equal probability to every dataset
+characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789éü '
+max_seconds_per_example: 10
+dataloader_num_workers: 8
+
+# This is a list of the sampling probability of each dataset, where null means that
+# each dataset will be sampled equally often
+dataset_probabilities:
  train: null
  val: null
  test: null
@@ -26,6 +31,7 @@ pipeline_id: ${model.name}-finetuned
hub_id: alexandrainst/${pipeline_id}
model_dir: ${dirs.models}/${pipeline_id}
push_to_hub: false
+fp16: true

# Training parameters
wandb: false
@@ -34,10 +40,18 @@ wandb_group: default
wandb_name: null
resume_from_checkpoint: false
ignore_data_skip: false
-save_total_limit: 2

# Optimisation parameters
learning_rate: 3e-5
adam_first_momentum: 0.9
adam_second_momentum: 0.98
-fp16: true
+batch_size: 8
+gradient_accumulation: 32
+max_steps: 50_000
+warmup_steps: 1_000
+logging_steps: 10
+eval_steps: 100
+save_steps: 100
+save_total_limit: 2
+early_stopping: false
+early_stopping_patience: 50
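Taken together, `config.yaml` now owns the dataset mixing and optimisation settings that previously lived in the individual model configs. The sketch below shows one way such a config could be consumed, assuming the Hugging Face `datasets` library and treating `null` probabilities as equal sampling; it is purely illustrative, and the repository's actual loading code in `src/coral_models/data.py` may differ.

```python
from datasets import interleave_datasets, load_dataset
from omegaconf import DictConfig


def load_train_data(cfg: DictConfig):
    """Illustrative only: load and interleave every dataset named in the Hydra config."""
    streams = [
        load_dataset(ds.id, ds.subset, split=ds.train_name)
        for ds in cfg.datasets.values()
    ]
    probs = cfg.dataset_probabilities.train
    if probs is None:
        # null in config.yaml means every dataset is sampled equally often
        probs = [1 / len(streams)] * len(streams)
    return interleave_datasets(streams, probabilities=probs, seed=cfg.seed)
```

Individual runs could still override the composition from the command line in the usual Hydra way, for example `python src/scripts/finetune_model.py model=whisper_xsmall 'datasets=[ftspeech]'` (a hypothetical invocation).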
7 changes: 7 additions & 0 deletions config/datasets/alvenir_test_set.yaml
@@ -0,0 +1,7 @@
+alvenir_test_set:
+  id: Alvenir/alvenir_asr_da_eval
+  subset: null
+  train_name: null
+  val_name: null
+  test_name: test
+  text_column: sentence
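For reference, these fields map onto a plain `datasets.load_dataset` call roughly as follows. This is a hypothetical usage sketch, not the project's own loading code:

```python
from datasets import load_dataset

# test_name is the only non-null split for this dataset, and text_column names
# the column holding the reference transcription.
eval_split = load_dataset("Alvenir/alvenir_asr_da_eval", split="test")
print(eval_split[0]["sentence"])
```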
6 changes: 5 additions & 1 deletion config/hydra/job_logging/custom.yaml
@@ -9,7 +9,11 @@ handlers:
    class: logging.StreamHandler
    formatter: simple
    stream: ext://sys.stdout
+  file:
+    class: logging.FileHandler
+    formatter: simple
+    filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
root:
-  handlers: [console]
+  handlers: [console, file]

disable_existing_loggers: false
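With the extra `file` handler, log records emitted inside a Hydra job should end up both on stdout and in a log file inside the run's output directory. A minimal sketch of the effect, assuming a standard `@hydra.main` entry point (script and config names here are illustrative):

```python
import logging

import hydra
from omegaconf import DictConfig

logger = logging.getLogger(__name__)


@hydra.main(config_path="config", config_name="config", version_base=None)
def main(cfg: DictConfig) -> None:
    # With the custom job_logging config, this message goes to the console
    # and to ${hydra.runtime.output_dir}/${hydra.job.name}.log
    logger.info("Finetuning %s", cfg.pipeline_id)


if __name__ == "__main__":
    main()
```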
13 changes: 0 additions & 13 deletions config/model/test_wav2vec2.yaml
@@ -5,7 +5,6 @@ freeze_feature_encoder: true

# Data hyperparameters
clean_dataset: true
-characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789éü '

# Model hyperparameters
sampling_rate: 16_000
@@ -23,15 +22,3 @@ ctc_loss_reduction: sum

# Decoder hyperparameters
language_model_decoder: null
-
-# Training hyperparameters
-batch_size: 1
-gradient_accumulation: 1
-max_steps: 3
-learning_rate: 4e-5
-warmup_steps: 1
-early_stopping: true
-early_stopping_patience: 5
-adam_first_momentum: 0.9
-adam_second_momentum: 0.999
-fp16: false
12 changes: 1 addition & 11 deletions config/model/test_whisper.yaml
@@ -15,14 +15,4 @@ mask_time_prob: 0.5
mask_time_length: 10
mask_feature_prob: 0.5
mask_feature_length: 64
-
-# Training hyperparameters
-batch_size: 1
-gradient_accumulation: 1
-max_steps: 3
-learning_rate: 4e-5
-warmup_steps: 1
-early_stopping: true
-early_stopping_patience: 5
-fp16: false
-generation_max_length: 1
+generation_max_length: 128
14 changes: 2 additions & 12 deletions config/model/wav2vec2.yaml
@@ -5,7 +5,6 @@ freeze_feature_encoder: false

# Data hyperparameters
clean_dataset: true
-characters_to_keep: 'abcdefghijklmnopqrstuvwxyzæøå0123456789éü '

# Model hyperparameters
sampling_rate: 16_000
@@ -15,9 +14,9 @@ hidden_dropout: 0.0
feat_proj_dropout: 0.0
feat_quantizer_dropout: 0.0
final_dropout: 0.0
-mask_time_prob: 0.5
+mask_time_prob: 0.3
mask_time_length: 10
-mask_feature_prob: 0.5
+mask_feature_prob: 0.3
mask_feature_length: 64
layerdrop: 0.1
ctc_loss_reduction: mean
@@ -29,12 +28,3 @@ decoder:
  dataset_subset: null
  dataset_split: train
  n: 5
-
-# Training hyperparameters
-batch_size: 8
-gradient_accumulation: 32
-max_steps: 13_000 # Based on the XLS-R paper, section 4.3
-warmup_steps: 1_300 # Based on the XLS-R paper, section 4.3
-learning_rate: 3e-5
-adam_first_momentum: 0.9
-adam_second_momentum: 0.98
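The wav2vec2 config now keeps only model hyperparameters (the training block moved into `config.yaml`), with the SpecAugment-style masking relaxed from 0.5 to 0.3. As a hedged illustration of where such settings typically land, assuming the standard Hugging Face `transformers` wav2vec 2.0 classes and a placeholder base checkpoint (the repository's own `wav2vec2.py` wiring may differ):

```python
from transformers import Wav2Vec2Config, Wav2Vec2ForCTC

# Values mirror config/model/wav2vec2.yaml above; the checkpoint name is illustrative.
config = Wav2Vec2Config.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    mask_time_prob=0.3,
    mask_time_length=10,
    mask_feature_prob=0.3,
    mask_feature_length=64,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
)
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xls-r-300m", config=config)
```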
40 changes: 0 additions & 40 deletions config/model/wav2vec2_no_reg.yaml

This file was deleted.

19 changes: 6 additions & 13 deletions config/model/whisper_large.yaml
@@ -8,18 +8,11 @@ clean_dataset: false

# Model hyperparameters
sampling_rate: 16_000
-dropout: 0.1
-activation_dropout: 0.1
-attention_dropout: 0.1
-mask_time_prob: 0.5
+dropout: 0.0
+activation_dropout: 0.0
+attention_dropout: 0.0
+mask_time_prob: 0.3
mask_time_length: 10
-mask_feature_prob: 0.5
+mask_feature_prob: 0.3
mask_feature_length: 64
-
-# Training hyperparameters
-batch_size: 1
-gradient_accumulation: 32
-max_steps: 120_000
-learning_rate: 3e-5
-warmup_steps: 500
-generation_max_length: 225
+generation_max_length: 128
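The dropout and masking changes mirror the wav2vec2 config above, and `generation_max_length` now caps the decoder at 128 tokens during evaluation instead of 225. A rough sketch of how these values, together with the training parameters moved into `config.yaml`, could be passed to a Hugging Face seq2seq trainer; the argument names follow `transformers.Seq2SeqTrainingArguments`, `output_dir` is a placeholder, and the project's own trainer setup may differ:

```python
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    output_dir="models/whisper-large-finetuned",  # placeholder path
    per_device_train_batch_size=8,   # batch_size in config.yaml
    gradient_accumulation_steps=32,  # gradient_accumulation in config.yaml
    learning_rate=3e-5,
    max_steps=50_000,
    warmup_steps=1_000,
    logging_steps=10,
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    fp16=True,
    predict_with_generate=True,
    generation_max_length=128,
)
```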