forked from yl4579/StyleTTS2
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
111 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
log_dir: "Models/LJSpeech" | ||
save_freq: 5 | ||
log_interval: 10 | ||
device: "cuda" | ||
epochs: 50 # number of finetuning epoch (1 hour of data) | ||
batch_size: 8 | ||
max_len: 400 # maximum number of frames | ||
pretrained_model: "Models/LibriTTS/epochs_2nd_00020.pth" | ||
second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage | ||
load_only_params: true # set to true if do not want to load epoch numbers and optimizer parameters | ||
|
||
F0_path: "Utils/JDC/bst.t7" | ||
ASR_config: "Utils/ASR/config.yml" | ||
ASR_path: "Utils/ASR/epoch_00080.pth" | ||
PLBERT_dir: 'Utils/PLBERT/' | ||
|
||
data_params: | ||
train_data: "Data/train_list.txt" | ||
val_data: "Data/val_list.txt" | ||
root_path: "/local/LJSpeech-1.1/wavs" | ||
OOD_data: "Data/OOD_texts.txt" | ||
min_length: 50 # sample until texts with this size are obtained for OOD texts | ||
|
||
preprocess_params: | ||
sr: 24000 | ||
spect_params: | ||
n_fft: 2048 | ||
win_length: 1200 | ||
hop_length: 300 | ||
|
||
model_params: | ||
multispeaker: true | ||
|
||
dim_in: 64 | ||
hidden_dim: 512 | ||
max_conv_dim: 512 | ||
n_layer: 3 | ||
n_mels: 80 | ||
|
||
n_token: 178 # number of phoneme tokens | ||
max_dur: 50 # maximum duration of a single phoneme | ||
style_dim: 128 # style vector size | ||
|
||
dropout: 0.2 | ||
|
||
# config for decoder | ||
decoder: | ||
type: 'hifigan' # either hifigan or istftnet | ||
resblock_kernel_sizes: [3,7,11] | ||
upsample_rates : [10,5,3,2] | ||
upsample_initial_channel: 512 | ||
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] | ||
upsample_kernel_sizes: [20,10,6,4] | ||
|
||
# speech language model config | ||
slm: | ||
model: 'microsoft/wavlm-base-plus' | ||
sr: 16000 # sampling rate of SLM | ||
hidden: 768 # hidden size of SLM | ||
nlayers: 13 # number of layers of SLM | ||
initial_channel: 64 # initial channels of SLM discriminator head | ||
|
||
# style diffusion model config | ||
diffusion: | ||
embedding_mask_proba: 0.1 | ||
# transformer config | ||
transformer: | ||
num_layers: 3 | ||
num_heads: 8 | ||
head_features: 64 | ||
multiplier: 2 | ||
|
||
# diffusion distribution config | ||
dist: | ||
sigma_data: 0.2 # placeholder for estimate_sigma_data set to false | ||
estimate_sigma_data: true # estimate sigma_data from the current batch if set to true | ||
mean: -3.0 | ||
std: 1.0 | ||
|
||
loss_params: | ||
lambda_mel: 5. # mel reconstruction loss | ||
lambda_gen: 1. # generator loss | ||
lambda_slm: 1. # slm feature matching loss | ||
|
||
lambda_mono: 1. # monotonic alignment loss (TMA) | ||
lambda_s2s: 1. # sequence-to-sequence loss (TMA) | ||
|
||
lambda_F0: 1. # F0 reconstruction loss | ||
lambda_norm: 1. # norm reconstruction loss | ||
lambda_dur: 1. # duration loss | ||
lambda_ce: 20. # duration predictor probability output CE loss | ||
lambda_sty: 1. # style reconstruction loss | ||
lambda_diff: 1. # score matching loss | ||
|
||
diff_epoch: 10 # style diffusion starting epoch | ||
joint_epoch: 30 # joint training starting epoch | ||
|
||
optimizer_params: | ||
lr: 0.0001 # general learning rate | ||
bert_lr: 0.00001 # learning rate for PLBERT | ||
ft_lr: 0.0001 # learning rate for acoustic modules | ||
|
||
slmadv_params: | ||
min_len: 400 # minimum length of samples | ||
max_len: 500 # maximum length of samples | ||
batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size | ||
iter: 10 # update the discriminator every this iterations of generator update | ||
thresh: 5 # gradient norm above which the gradient is scaled | ||
scale: 0.01 # gradient scaling factor for predictors from SLM discriminators | ||
sig: 1.5 # sigma for differentiable duration modeling | ||
|