Merge pull request #4 from idiap/hindi

feat(xtts): support Hindi for sentence-splitting and fine-tuning
idiap · Apr 11, 2024 · 2ad790d · 2ad790d
2 parents dfbe016 + d416865
commit 2ad790d
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 5 deletions.
diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -192,6 +192,7 @@ def read_logs():
                     "hu",
                     "ko",
                     "ja",
+                    "hi",
                 ],
             )
             progress_data = gr.Label(label="Progress:")
@@ -370,6 +371,7 @@ def train_model(
                             "hu",
                             "ko",
                             "ja",
+                            "hi",
                         ],
                     )
                     tts_text = gr.Textbox(

diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
@@ -12,6 +12,7 @@
 from spacy.lang.ar import Arabic
 from spacy.lang.en import English
 from spacy.lang.es import Spanish
+from spacy.lang.hi import Hindi
 from spacy.lang.ja import Japanese
 from spacy.lang.zh import Chinese
 from tokenizers import Tokenizer
@@ -22,6 +23,7 @@
 
 
 def get_spacy_lang(lang):
+    """Return Spacy language used for sentence splitting."""
     if lang == "zh":
         return Chinese()
     elif lang == "ja":
@@ -30,8 +32,10 @@ def get_spacy_lang(lang):
         return Arabic()
     elif lang == "es":
         return Spanish()
+    elif lang == "hi":
+        return Hindi()
     else:
-        # For most languages, Enlish does the job
+        # For most languages, English does the job
         return English()
 
 
@@ -614,6 +618,7 @@ def __init__(self, vocab_file=None):
             "ja": 71,
             "hu": 224,
             "ko": 95,
+            "hi": 150,
         }
 
     @cached_property

diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
@@ -14,16 +14,31 @@ There is no need for an excessive amount of training data that spans countless h
 ### Updates with v2
 - Improved voice cloning.
 - Voices can be cloned with a single audio file or multiple audio files, without any effect on the runtime.
-- 2 new languages: Hungarian and Korean.
 - Across the board quality improvements.
 
 ### Code
 Current implementation only supports inference and GPT encoder training.
 
 ### Languages
-As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).
-
-Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out.
+XTTS-v2 supports 17 languages:
+
+- Arabic (ar)
+- Chinese (zh-cn)
+- Czech (cs)
+- Dutch (nl)
+- English (en)
+- French (fr)
+- German (de)
+- Hindi (hi)
+- Hungarian (hu)
+- Italian (it)
+- Japanese (ja)
+- Korean (ko)
+- Polish (pl)
+- Portuguese (pt)
+- Russian (ru)
+- Spanish (es)
+- Turkish (tr)
 
 ### License
 This model is licensed under [Coqui Public Model License](https://coqui.ai/cpml).