Add unicode normalize to every cleaner
shavit committed Sep 30, 2024
1 parent 636ea59 commit 4ea85b0
Showing 1 changed file with 11 additions and 0 deletions.
TTS/tts/utils/text/cleaners.py (11 additions, 0 deletions)
@@ -82,13 +82,15 @@ def replace_symbols(text: str, lang: Optional[str] = "en") -> str:

def basic_cleaners(text: str) -> str:
"""Basic pipeline that lowercases and collapses whitespace without transliteration."""
text = normalize_unicode(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text


def transliteration_cleaners(text: str) -> str:
"""Pipeline for non-English text that transliterates to ASCII."""
text = normalize_unicode(text)
# text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
@@ -97,6 +99,7 @@ def transliteration_cleaners(text: str) -> str:

def basic_german_cleaners(text: str) -> str:
"""Pipeline for German text"""
text = normalize_unicode(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
@@ -105,6 +108,7 @@ def basic_german_cleaners(text: str) -> str:
# TODO: elaborate it
def basic_turkish_cleaners(text: str) -> str:
"""Pipeline for Turkish text"""
text = normalize_unicode(text)
text = text.replace("I", "ı")
text = lowercase(text)
text = collapse_whitespace(text)
@@ -113,6 +117,7 @@

def english_cleaners(text: str) -> str:
"""Pipeline for English text, including number and abbreviation expansion."""
text = normalize_unicode(text)
# text = convert_to_ascii(text)
text = lowercase(text)
text = expand_time_english(text)
@@ -130,6 +135,7 @@ def phoneme_cleaners(text: str) -> str:
NB: This cleaner converts numbers into English words, for other languages
use multilingual_phoneme_cleaners().
"""
text = normalize_unicode(text)
text = en_normalize_numbers(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
@@ -140,6 +146,7 @@ def phoneme_cleaners(text: str) -> str:

def multilingual_phoneme_cleaners(text: str) -> str:
"""Pipeline for phonemes mode, including number and abbreviation expansion."""
text = normalize_unicode(text)
text = replace_symbols(text, lang=None)
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
@@ -148,6 +155,7 @@ def multilingual_phoneme_cleaners(text: str) -> str:

def french_cleaners(text: str) -> str:
"""Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
text = normalize_unicode(text)
text = expand_abbreviations(text, lang="fr")
text = lowercase(text)
text = replace_symbols(text, lang="fr")
@@ -159,6 +167,7 @@ def french_cleaners(text: str) -> str:
def portuguese_cleaners(text: str) -> str:
"""Basic pipeline for Portuguese text. There is no need to expand abbreviation and
numbers, phonemizer already does that"""
text = normalize_unicode(text)
text = lowercase(text)
text = replace_symbols(text, lang="pt")
text = remove_aux_symbols(text)
@@ -168,12 +177,14 @@ def portuguese_cleaners(text: str) -> str:

def chinese_mandarin_cleaners(text: str) -> str:
"""Basic pipeline for chinese"""
text = normalize_unicode(text)
text = replace_numbers_to_characters_in_text(text)
return text


def multilingual_cleaners(text: str) -> str:
"""Pipeline for multilingual text"""
text = normalize_unicode(text)
text = lowercase(text)
text = replace_symbols(text, lang=None)
text = remove_aux_symbols(text)
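For context, the normalize_unicode helper that this commit calls at the top of every cleaner is defined elsewhere in cleaners.py and is not part of the visible hunks. Below is a minimal sketch of what such a helper typically looks like, assuming it wraps the standard library's unicodedata.normalize with NFC (canonical composition); the actual implementation in the repository may differ.

import unicodedata


def normalize_unicode(text: str) -> str:
    # Sketch only: fold equivalent Unicode sequences (e.g. a combining accent
    # vs. a precomposed character) into one canonical NFC form, so every
    # downstream cleaning step sees the same codepoints for the "same" text.
    return unicodedata.normalize("NFC", text)

Calling it first in each pipeline means lowercasing, symbol replacement, abbreviation expansion, and phonemization all operate on a consistent representation; for instance, "cafe\u0301" (e followed by a combining acute) and "caf\u00e9" (precomposed e with acute) become identical strings before any other cleaning step runs.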