diff --git a/data/ca-ba/lexicon.db b/data/ca-ba/lexicon.db index 3c3e57c..c961947 100644 Binary files a/data/ca-ba/lexicon.db and b/data/ca-ba/lexicon.db differ diff --git a/data/ca-no/lexicon.db b/data/ca-no/lexicon.db index 2b0cb8c..6cb9e0e 100644 Binary files a/data/ca-no/lexicon.db and b/data/ca-no/lexicon.db differ diff --git a/data/ca-va/lexicon.db b/data/ca-va/lexicon.db index fb8837f..5b6518d 100644 Binary files a/data/ca-va/lexicon.db and b/data/ca-va/lexicon.db differ diff --git a/gruut/lang.py b/gruut/lang.py index badb671..ccfffaa 100644 --- a/gruut/lang.py +++ b/gruut/lang.py @@ -2180,7 +2180,26 @@ def ca_post_process_sentence( nodes.append(typing.cast(PunctuationWordNode, node)) lang = identify_lang(nodes) - + + # HACK + # Training corpora include an invalid sequence of phonemes: l ʎ l + # We fix that here; it will be properly solved in the next iteration + phonemes_to_fix = "l ʎ l" + fixed_phonemes = "l l" + for node in nodes: + + if node is None: + continue + + if isinstance(node, WordNode): + if not (node.text and node.phonemes): + continue + phonemes_text = " ".join(node.phonemes) + if phonemes_to_fix in phonemes_text: + phonemes_text = phonemes_text.replace(phonemes_to_fix, fixed_phonemes) + node.phonemes = phonemes_text.split(" ") + _LOGGER.debug(f"FIX: phoneme sequence '{phonemes_to_fix}' fixed at {node.text}. Fixed transcription: {node.phonemes}") + # Create a list of contiguous word nodes contiguous_word_nodes = [] for node_1, node_2 in sliding_window(nodes, 2): @@ -2309,6 +2328,4 @@ def __call__( self.phonemizer = SqlitePhonemizer(db_conn=db_conn, **self.phonemizer_args) assert self.phonemizer is not None - return self.phonemizer(word, role=role, do_transforms=do_transforms) - - + return self.phonemizer(word, role=role, do_transforms=do_transforms) \ No newline at end of file