Add unicode normalize to every cleaner
shavit committed Sep 30, 2024
1 parent 636ea59 commit 4ea85b0
Showing 1 changed file with 11 additions and 0 deletions.
TTS/tts/utils/text/cleaners.py (11 additions, 0 deletions)
@@ -82,13 +82,15 @@ def replace_symbols(text: str, lang: Optional[str] = "en") -> str:

def basic_cleaners(text: str) -> str:
"""Basic pipeline that lowercases and collapses whitespace without transliteration."""
text = normalize_unicode(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text


def transliteration_cleaners(text: str) -> str:
"""Pipeline for non-English text that transliterates to ASCII."""
text = normalize_unicode(text)
# text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
@@ -97,6 +99,7 @@ def transliteration_cleaners(text: str) -> str:

def basic_german_cleaners(text: str) -> str:
"""Pipeline for German text"""
text = normalize_unicode(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
@@ -105,6 +108,7 @@ def basic_german_cleaners(text: str) -> str:
# TODO: elaborate it
def basic_turkish_cleaners(text: str) -> str:
"""Pipeline for Turkish text"""
text = normalize_unicode(text)
text = text.replace("I", "ı")
text = lowercase(text)
text = collapse_whitespace(text)
@@ -113,6 +117,7 @@

def english_cleaners(text: str) -> str:
"""Pipeline for English text, including number and abbreviation expansion."""
text = normalize_unicode(text)
# text = convert_to_ascii(text)
text = lowercase(text)
text = expand_time_english(text)
@@ -130,6 +135,7 @@ def phoneme_cleaners(text: str) -> str:
NB: This cleaner converts numbers into English words, for other languages
use multilingual_phoneme_cleaners().
"""
text = normalize_unicode(text)
text = en_normalize_numbers(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
@@ -140,6 +146,7 @@ def phoneme_cleaners(text: str) -> str:

def multilingual_phoneme_cleaners(text: str) -> str:
"""Pipeline for phonemes mode, including number and abbreviation expansion."""
text = normalize_unicode(text)
text = replace_symbols(text, lang=None)
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
@@ -148,6 +155,7 @@ def multilingual_phoneme_cleaners(text: str) -> str:

def french_cleaners(text: str) -> str:
"""Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
text = normalize_unicode(text)
text = expand_abbreviations(text, lang="fr")
text = lowercase(text)
text = replace_symbols(text, lang="fr")
@@ -159,6 +167,7 @@ def french_cleaners(text: str) -> str:
def portuguese_cleaners(text: str) -> str:
"""Basic pipeline for Portuguese text. There is no need to expand abbreviation and
numbers, phonemizer already does that"""
text = normalize_unicode(text)
text = lowercase(text)
text = replace_symbols(text, lang="pt")
text = remove_aux_symbols(text)
@@ -168,12 +177,14 @@ def portuguese_cleaners(text: str) -> str:

def chinese_mandarin_cleaners(text: str) -> str:
"""Basic pipeline for chinese"""
text = normalize_unicode(text)
text = replace_numbers_to_characters_in_text(text)
return text


def multilingual_cleaners(text: str) -> str:
"""Pipeline for multilingual text"""
text = normalize_unicode(text)
text = lowercase(text)
text = replace_symbols(text, lang=None)
text = remove_aux_symbols(text)
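For context, the normalize_unicode helper that this commit calls at the top of every cleaner is defined elsewhere in cleaners.py and is not part of the visible hunks. Below is a minimal sketch of what such a helper typically looks like, assuming it wraps the standard library's unicodedata.normalize with NFC (canonical composition); the actual implementation in the repository may differ.

import unicodedata


def normalize_unicode(text: str) -> str:
    # Sketch only: fold equivalent Unicode sequences (e.g. a combining accent
    # vs. a precomposed character) into one canonical NFC form, so every
    # downstream cleaning step sees the same codepoints for the "same" text.
    return unicodedata.normalize("NFC", text)

Calling it first in each pipeline means lowercasing, symbol replacement, abbreviation expansion, and phonemization all operate on a consistent representation; for instance, "cafe\u0301" (e followed by a combining acute) and "caf\u00e9" (precomposed e with acute) become identical strings before any other cleaning step runs.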