Replies: 1 comment
-
Most of the datasets you have mentioned are not really intended for training TTS models. I suppose they contain a lot of noise and may not be perfectly aligned with the text. You may want to read https://tts.readthedocs.io/en/latest/what_makes_a_good_dataset.html to get some ideas of how your data should look and sound.
Beta Was this translation helpful? Give feedback.
0 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
-
Hey all,
I am desperately trying to train a multi-speaker German model. I successfully trained the encoder and tried to train several models: Tacotron 2 DDA, DCA, Glow-TTS and VITS. I am using VoxForge, Distant Speech, LibriVoxDeEn and Common Voice, although I also dropped Common Voice for some runs. Although my loss is constantly decreasing, the resulting audio is awful.
Any help would be highly appreciated!
Here are some TensorBoard plots and my settings for a VITS run:
{
![eval_stats](https://user-images.githubusercontent.com/67948001/172341779-ccd51df2-b562-48e8-93d3-b5dc17ad3b0c.png)
![train_epoch_stats](https://user-images.githubusercontent.com/67948001/172341800-2217e522-1403-4151-82fe-5923dbdf70b4.png)
![train_figs](https://user-images.githubusercontent.com/67948001/172342482-9c3e2d44-de32-4b46-bb05-7e442e3d52be.png)
![test_figs](https://user-images.githubusercontent.com/67948001/172342488-ced24cde-e944-4cf9-8bfa-c4a2445bd35c.png)
![eval_fig](https://user-images.githubusercontent.com/67948001/172342489-91c71dea-aa14-4144-a75b-e51c0042a288.png)
"output_path": "/home/people/ojaggy/TTS_frog",
"logger_uri": null,
"run_name": "vits_vctk",
"project_name": null,
"run_description": "\ud83d\udc38Coqui trainer run.",
"print_step": 25,
"plot_step": 100,
"model_param_stats": false,
"wandb_entity": null,
"dashboard_logger": "tensorboard",
"log_model_step": 2500,
"save_step": 2500,
"save_n_checkpoints": 5,
"save_checkpoints": true,
"save_all_best": false,
"save_best_after": 10000,
"target_loss": null,
"print_eval": false,
"test_delay_epochs": -1,
"run_eval": true,
"distributed_backend": "nccl",
"distributed_url": "tcp://localhost:54321",
"mixed_precision": true,
"epochs": 1000,
"batch_size": 32,
"eval_batch_size": 16,
"grad_clip": [
1000,
1000
],
"scheduler_after_epoch": true,
"lr": 0.001,
"optimizer": "AdamW",
"optimizer_params": {
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"weight_decay": 0.01
},
"lr_scheduler": "",
"lr_scheduler_params": {},
"use_grad_scaler": false,
"cudnn_enable": true,
"cudnn_deterministic": false,
"cudnn_benchmark": true,
"training_seed": 54321,
"model": "vits",
"num_loader_workers": 4,
"num_eval_loader_workers": 4,
"use_noise_augment": false,
"audio": {
"fft_size": 1024,
"win_length": 1024,
"hop_length": 256,
"frame_shift_ms": null,
"frame_length_ms": null,
"stft_pad_mode": "reflect",
"sample_rate": 22050,
"resample": false,
"preemphasis": 0.0,
"ref_level_db": 20,
"do_sound_norm": false,
"log_func": "np.log",
"do_trim_silence": true,
"trim_db": 60,
"do_rms_norm": false,
"db_level": null,
"power": 1.5,
"griffin_lim_iters": 60,
"num_mels": 80,
"mel_fmin": 0.0,
"mel_fmax": 8000,
"spec_gain": 1,
"do_amp_to_db_linear": true,
"do_amp_to_db_mel": true,
"pitch_fmax": 640.0,
"pitch_fmin": 0.0,
"signal_norm": false,
"min_level_db": -100,
"symmetric_norm": true,
"max_norm": 4.0,
"clip_norm": true,
"stats_path": null
},
"use_phonemes": true,
"phonemizer": "gruut",
"phoneme_language": "de-de",
"compute_input_seq_cache": true,
"text_cleaner": "phoneme_cleaners",
"enable_eos_bos_chars": false,
"test_sentences_file": "",
"phoneme_cache_path": "/home/people/ojaggy/phoneme_cache",
"characters": {
"characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
"vocab_dict": null,
"pad": "",
"eos": "",
"bos": "",
"blank": "",
"characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
"punctuations": "!'(),-.:;? ",
"phonemes": null,
"is_unique": false,
"is_sorted": true
},
"add_blank": true,
"batch_group_size": 5,
"loss_masking": null,
"sort_by_audio_len": false,
"min_audio_len": 1,
"max_audio_len": Infinity,
"min_text_len": 3,
"max_text_len": 190,
"compute_f0": false,
"compute_linear_spec": true,
"precompute_num_workers": 0,
"start_by_longest": false,
"datasets": [
{
"name": "new_multi",
"path": "/home/people/ojaggy/TTS_frog/Dataset/new",
"meta_file_train": "meta_dist_voxdeen_vox_forge.txt",
"ignored_speakers": null,
"language": "de-de",
"meta_file_val": "",
"meta_file_attn_mask": ""
}
],
"test_sentences": [
"Es hat lange gedauert bis ich eine Stimme entwickelt habe und nun werde ich nicht schweigen.",
"Sei eine Stimme, kein Echon.",
"Es tut mir leid David. Das werde ich nicht tun.",
"Dieser Kuchen ist lecker. Er ist gut und nicht trocken",
"Vor dem 22 November, 1939"
],
"eval_split_max_size": null,
"eval_split_size": 0.01,
"use_speaker_weighted_sampler": true,
"speaker_weighted_sampler_alpha": 1.0,
"use_language_weighted_sampler": false,
"language_weighted_sampler_alpha": 1.0,
"model_args": {
"num_chars": 131,
"out_channels": 513,
"spec_segment_size": 32,
"hidden_channels": 192,
"hidden_channels_ffn_text_encoder": 768,
"num_heads_text_encoder": 2,
"num_layers_text_encoder": 6,
"kernel_size_text_encoder": 3,
"dropout_p_text_encoder": 0.1,
"dropout_p_duration_predictor": 0.5,
"kernel_size_posterior_encoder": 5,
"dilation_rate_posterior_encoder": 1,
"num_layers_posterior_encoder": 16,
"kernel_size_flow": 5,
"dilation_rate_flow": 1,
"num_layers_flow": 4,
"resblock_type_decoder": "1",
"resblock_kernel_sizes_decoder": [
3,
7,
11
],
"resblock_dilation_sizes_decoder": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates_decoder": [
8,
8,
2,
2
],
"upsample_initial_channel_decoder": 512,
"upsample_kernel_sizes_decoder": [
16,
16,
4,
4
],
"use_sdp": true,
"noise_scale": 1.0,
"inference_noise_scale": 0.667,
"length_scale": 1,
"noise_scale_dp": 1.0,
"inference_noise_scale_dp": 1.0,
"max_inference_len": null,
"init_discriminator": true,
"use_spectral_norm_disriminator": false,
"use_speaker_embedding": false,
"num_speakers": 16384,
"speakers_file": "/home/people/ojaggy/TTS_frog/vits_vctk-June-03-2022_06+00PM-c410bc58/speakers.json",
"d_vector_file": "d_vector.json",
"speaker_embedding_channels": 256,
"use_d_vector_file": true,
"d_vector_dim": 256,
"detach_dp_input": true,
"use_language_embedding": false,
"embedded_language_dim": 4,
"num_languages": 0,
"language_ids_file": null,
"use_speaker_encoder_as_loss": false,
"speaker_encoder_config_path": "",
"speaker_encoder_model_path": "",
"condition_dp_on_speaker": true,
"freeze_encoder": false,
"freeze_DP": false,
"freeze_PE": false,
"freeze_flow_decoder": false,
"freeze_waveform_decoder": false
},
"lr_gen": 0.0002,
"lr_disc": 0.0002,
"lr_scheduler_gen": "ExponentialLR",
"lr_scheduler_gen_params": {
"gamma": 0.999875,
"last_epoch": -1
},
"lr_scheduler_disc": "ExponentialLR",
"lr_scheduler_disc_params": {
"gamma": 0.999875,
"last_epoch": -1
},
"kl_loss_alpha": 1.0,
"disc_loss_alpha": 1.0,
"gen_loss_alpha": 1.0,
"feat_loss_alpha": 1.0,
"mel_loss_alpha": 45.0,
"dur_loss_alpha": 1.0,
"speaker_encoder_loss_alpha": 1.0,
"return_wav": true,
"r": 1,
"num_speakers": 16384,
"use_speaker_embedding": false,
"speakers_file": "/home/people/ojaggy/TTS_frog/vits_vctk-June-03-2022_06+00PM-c410bc58/speakers.json",
"speaker_embedding_channels": 256,
"language_ids_file": null,
"use_language_embedding": false,
"use_d_vector_file": true,
"d_vector_file": "d_vector.json",
"d_vector_dim": 256
}
Beta Was this translation helpful? Give feedback.
All reactions