From 910e5db9317e62a539aa532ab46021cd3ee7932f Mon Sep 17 00:00:00 2001 From: Yuekai Zhang Date: Thu, 22 Feb 2024 15:55:01 +0800 Subject: [PATCH] add manifests for whisper --- .../ASR/local/compute_fbank_aishell4.py | 20 +++-- egs/aishell4/ASR/prepare.sh | 27 ++++--- .../ASR/local/compute_fbank_alimeeting.py | 20 +++-- egs/alimeeting/ASR/prepare.sh | 22 ++--- egs/multi_zh-hans/ASR/prepare.sh | 80 ++++++++++++++----- 5 files changed, 115 insertions(+), 54 deletions(-) diff --git a/egs/aishell4/ASR/local/compute_fbank_aishell4.py b/egs/aishell4/ASR/local/compute_fbank_aishell4.py index f191639888..235efe7e1a 100755 --- a/egs/aishell4/ASR/local/compute_fbank_aishell4.py +++ b/egs/aishell4/ASR/local/compute_fbank_aishell4.py @@ -29,7 +29,7 @@ from pathlib import Path import torch -from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig +from lhotse import ChunkedLilcomHdf5Writer, CutSet, WhisperFbank, WhisperFbankConfig, Fbank, FbankConfig from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor, str2bool @@ -42,10 +42,10 @@ torch.set_num_interop_threads(1) -def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False): +def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False): src_dir = Path("data/manifests/aishell4") output_dir = Path("data/fbank") - num_jobs = min(15, os.cpu_count()) + num_jobs = min(8, os.cpu_count()) dataset_parts = ( "train_S", @@ -70,7 +70,10 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False): dataset_parts, ) - extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins)) + if whisper_fbank: + extractor = WhisperFbank(WhisperFbankConfig(num_filters=num_mel_bins, device='cuda')) + else: + extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins)) with get_executor() as ex: # Initialize the executor only once. for partition, m in manifests.items(): @@ -121,7 +124,12 @@ def get_args(): default=False, help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.", ) - + parser.add_argument( + "--whisper-fbank", + type=str2bool, + default=False, + help="Use WhisperFbank instead of Fbank. Default: False.", + ) return parser.parse_args() @@ -132,5 +140,5 @@ def get_args(): args = get_args() compute_fbank_aishell4( - num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed + num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed, whisper_fbank=args.whisper_fbank ) diff --git a/egs/aishell4/ASR/prepare.sh b/egs/aishell4/ASR/prepare.sh index e8d9eb7b97..945805a979 100755 --- a/egs/aishell4/ASR/prepare.sh +++ b/egs/aishell4/ASR/prepare.sh @@ -5,8 +5,8 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python set -eou pipefail -stage=-1 -stop_stage=100 +stage=20 +stop_stage=20 perturb_speed=true @@ -76,7 +76,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 2: Process aishell4" + log "Stage 2: Compute fbank for aishell4" if [ ! -f data/fbank/aishell4/.fbank.done ]; then mkdir -p data/fbank/aishell4 ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed} @@ -84,6 +84,16 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then fi fi +whisper_mel_bins=80 +if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then + log "Stage 20: Compute whisper fbank for aishell4" + if [ ! -f data/fbank/aishell4/.fbank.done ]; then + mkdir -p data/fbank/aishell4 + ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true + touch data/fbank/aishell4/.fbank.done + fi +fi + if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then log "Stage 3: Prepare musan manifest" # We assume that you have downloaded the musan corpus @@ -106,16 +116,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then fi if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Compute fbank for aishell4" - if [ ! -f data/fbank/.aishell4.done ]; then - mkdir -p data/fbank - ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed} - touch data/fbank/.aishell4.done - fi -fi - -if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then - log "Stage 6: Prepare char based lang" + log "Stage 5: Prepare char based lang" lang_char_dir=data/lang_char mkdir -p $lang_char_dir diff --git a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py index f8c10648a5..32b4173a5a 100755 --- a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py +++ b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py @@ -29,7 +29,7 @@ from pathlib import Path import torch -from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter +from lhotse import CutSet, WhisperFbank, WhisperFbankConfig, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor, str2bool @@ -42,10 +42,10 @@ torch.set_num_interop_threads(1) -def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False): +def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False): src_dir = Path("data/manifests/alimeeting") output_dir = Path("data/fbank") - num_jobs = min(15, os.cpu_count()) + num_jobs = min(8, os.cpu_count()) dataset_parts = ( "train", @@ -70,7 +70,10 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False dataset_parts, ) - extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins)) + if whisper_fbank: + extractor = WhisperFbank(WhisperFbankConfig(num_filters=num_mel_bins, device='cuda')) + else: + extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins)) with get_executor() as ex: # Initialize the executor only once. for partition, m in manifests.items(): @@ -121,7 +124,12 @@ def get_args(): default=False, help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.", ) - + parser.add_argument( + "--whisper-fbank", + type=str2bool, + default=False, + help="Use the Whisper Fbank feature extractor. Default: False.", + ) return parser.parse_args() @@ -132,5 +140,5 @@ def get_args(): args = get_args() compute_fbank_alimeeting( - num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed + num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed, whisper_fbank=args.whisper_fbank ) diff --git a/egs/alimeeting/ASR/prepare.sh b/egs/alimeeting/ASR/prepare.sh index c8fed658d7..3026c6dd61 100755 --- a/egs/alimeeting/ASR/prepare.sh +++ b/egs/alimeeting/ASR/prepare.sh @@ -66,13 +66,22 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 2: Process alimeeting" + log "Stage 2: compute fbank for alimeeting" if [ ! -f data/fbank/alimeeting/.fbank.done ]; then mkdir -p data/fbank/alimeeting ./local/compute_fbank_alimeeting.py --perturb-speed ${perturb_speed} fi fi +whisper_mel_bins=80 +if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then + log "Stage 20: compute whisper fbank for alimeeting" + if [ ! -f data/fbank/alimeeting/.fbank.done ]; then + mkdir -p data/fbank/alimeeting + ./local/compute_fbank_alimeeting.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true + fi +fi + if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then log "Stage 3: Prepare musan manifest" # We assume that you have downloaded the musan corpus @@ -95,16 +104,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then fi if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Compute fbank for alimeeting" - if [ ! -f data/fbank/.alimeeting.done ]; then - mkdir -p data/fbank - ./local/compute_fbank_alimeeting.py --perturb-speed True - touch data/fbank/.alimeeting.done - fi -fi - -if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then - log "Stage 6: Prepare char based lang" + log "Stage 5: Prepare char based lang" lang_char_dir=data/lang_char mkdir -p $lang_char_dir diff --git a/egs/multi_zh-hans/ASR/prepare.sh b/egs/multi_zh-hans/ASR/prepare.sh index 180919a698..9cab26a136 100755 --- a/egs/multi_zh-hans/ASR/prepare.sh +++ b/egs/multi_zh-hans/ASR/prepare.sh @@ -60,7 +60,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then if [ ! -f data/fbank/.thchs30.done ]; then mkdir -p data/fbank - ./local/compute_fbank_thchs30.py + ./local/compute_fbank_thchs30.py --speed-perturb true touch data/fbank/.thchs30.done fi fi @@ -137,7 +137,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then if [ ! -f data/fbank/.stcmds.done ]; then mkdir -p data/fbank - ./local/compute_fbank_stcmds.py + ./local/compute_fbank_stcmds.py --speed-perturb true touch data/fbank/.stcmds.done fi fi @@ -151,15 +151,15 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then lhotse download primewords $dl_dir/primewords fi - if [ ! -f data/manifests/.stcmds.done ]; then + if [ ! -f data/manifests/.primewords.done ]; then mkdir -p data/manifests - lhotse prepare stcmds $dl_dir/primewords data/manifests/primewords + lhotse prepare primewords $dl_dir/primewords data/manifests/primewords touch data/manifests/.primewords.done fi if [ ! -f data/fbank/.primewords.done ]; then mkdir -p data/fbank - ./local/compute_fbank_primewords.py + ./local/compute_fbank_primewords.py --speed-perturb true touch data/fbank/.primewords.done fi fi @@ -180,7 +180,7 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then if [ ! -f data/fbank/.magicdata.done ]; then mkdir -p data/fbank - ./local/compute_fbank_magicdata.py + ./local/compute_fbank_magicdata.py --speed-perturb true touch data/fbank/.magicdata.done fi fi @@ -291,10 +291,10 @@ if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then fi log "Compute KeSpeech fbank for train_phase1" - ./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --training-subset train_phase1 + ./local/compute_fbank_kespeech_splits.py --speed-perturb true --num-splits ${num_splits} --training-subset train_phase1 log "Compute KeSpeech fbank for train_phase2" - ./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --training-subset train_phase2 + ./local/compute_fbank_kespeech_splits.py --speed-perturb true --num-splits ${num_splits} --training-subset train_phase2 log "Compute KeSpeech fbank for test/dev" ./local/compute_fbank_kespeech_dev_test.py @@ -344,10 +344,10 @@ if [ $stage -le 120 ] && [ $stop_stage -ge 120 ]; then fi log "Compute KeSpeech fbank for train_phase1" - ./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --training-subset train_phase1 --num-mel-bins ${whisper_mel_bins} --whisper-fbank true + ./local/compute_fbank_kespeech_splits.py --speed-perturb true --num-splits ${num_splits} --training-subset train_phase1 --num-mel-bins ${whisper_mel_bins} --whisper-fbank true log "Compute KeSpeech fbank for train_phase2" - ./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --training-subset train_phase2 --num-mel-bins ${whisper_mel_bins} --whisper-fbank true + ./local/compute_fbank_kespeech_splits.py --speed-perturb true --num-splits ${num_splits} --training-subset train_phase2 --num-mel-bins ${whisper_mel_bins} --whisper-fbank true log "Compute KeSpeech fbank for test/dev" ./local/compute_fbank_kespeech_dev_test.py --num-mel-bins ${whisper_mel_bins} --whisper-fbank true @@ -356,19 +356,63 @@ if [ $stage -le 120 ] && [ $stop_stage -ge 120 ]; then fi fi -if [ $stage -le 121 ] && [ $stop_stage -ge 121 ]; then - log "Stage 121: tmp" - log "Compute KeSpeech fbank for train_phase1" - ./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --stop 1 --training-subset train_phase1 --num-mel-bins ${whisper_mel_bins} --whisper-fbank true +if [ $stage -le 122 ] && [ $stop_stage -ge 122 ]; then + log "Stage 122: Prepare speed perturb versionKeSpeech for whisper" + ./local/compute_fbank_kespeech_splits.py --speed-perturb true --num-splits ${num_splits} --training-subset train_phase1 --num-mel-bins ${whisper_mel_bins} --whisper-fbank true log "Compute KeSpeech fbank for train_phase2" - ./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --training-subset train_phase2 --num-mel-bins ${whisper_mel_bins} --whisper-fbank true + ./local/compute_fbank_kespeech_splits.py --speed-perturb true --num-splits ${num_splits} --training-subset train_phase2 --num-mel-bins ${whisper_mel_bins} --whisper-fbank true +fi - log "Compute KeSpeech fbank for test/dev" - ./local/compute_fbank_kespeech_dev_test.py --num-mel-bins ${whisper_mel_bins} --whisper-fbank true +if [ $stage -le 121 ] && [ $stop_stage -ge 121 ]; then + log "Stage 121: Prepare MagicData, Primewords, ST-CMDS, THCHS-30 for whisper" - touch data/fbank/.kespeech.done + if [ ! -f data/manifests/.magicdata.done ]; then + mkdir -p data/manifests + lhotse prepare magicdata $dl_dir/magicdata data/manifests/magicdata + touch data/manifests/.magicdata.done + fi + + if [ ! -f data/manifests/.primewords.done ]; then + mkdir -p data/manifests + lhotse prepare primewords $dl_dir/primewords data/manifests/primewords + touch data/manifests/.primewords.done + fi + if [ ! -f data/manifests/.stcmds.done ]; then + mkdir -p data/manifests + lhotse prepare stcmds $dl_dir/stcmds data/manifests/stcmds + touch data/manifests/.stcmds.done + fi + + if [ ! -f data/manifests/.thchs30.done ]; then + mkdir -p data/manifests + lhotse prepare thchs-30 $dl_dir/thchs30 data/manifests/thchs30 + touch data/manifests/.thchs30.done + fi + + if [ ! -f data/fbank/.thchs30.done ]; then + mkdir -p data/fbank + ./local/compute_fbank_thchs30.py --speed-perturb true --num-mel-bins ${whisper_mel_bins} --whisper-fbank true + touch data/fbank/.thchs30.done fi + + if [ ! -f data/fbank/.stcmds.done ]; then + mkdir -p data/fbank + ./local/compute_fbank_stcmds.py --speed-perturb true --num-mel-bins ${whisper_mel_bins} --whisper-fbank true + touch data/fbank/.stcmds.done + fi + if [ ! -f data/fbank/.magicdata.done ]; then + mkdir -p data/fbank + ./local/compute_fbank_magicdata.py --speed-perturb true --num-mel-bins ${whisper_mel_bins} --whisper-fbank true + touch data/fbank/.magicdata.done + fi + + if [ ! -f data/fbank/.primewords.done ]; then + mkdir -p data/fbank + ./local/compute_fbank_primewords.py --speed-perturb true --num-mel-bins ${whisper_mel_bins} --whisper-fbank true + touch data/fbank/.primewords.done + fi + fi