From 3ef6dd3de9a05cc681bd2f6b65e0f72e8d8702de Mon Sep 17 00:00:00 2001
From: Denis Moyogo Jacquerye
Date: Tue, 1 Nov 2022 10:14:58 +0100
Subject: [PATCH] Test languages exemplars canonical duplicates

---
Review notes (ignored by git am):
- The new test groups exemplar entries by their NFC form and requires every
  group to be exactly {NFC form}; it therefore also fails for a single entry
  that is not stored in NFC.
- Brace wrappers are removed with a slice so that only one pair is dropped.
- Assumes `unicodedata` and `defaultdict` are already imported at the top of
  tests/test_data_languages.py; this patch does not add them - TODO confirm.
- The blob id after ".." on the index line predates these review edits;
  regenerate the patch with git format-patch once it has been applied.

 tests/test_data_languages.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/tests/test_data_languages.py b/tests/test_data_languages.py
index 9c30318b..3f9adff8 100644
--- a/tests/test_data_languages.py
+++ b/tests/test_data_languages.py
@@ -45,6 +45,33 @@
 @pytest.mark.parametrize(
     "exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
 )
+def test_languages_exemplars_canonical_duplicates(lang_code, exemplar_name):
+    """Check that no two exemplar entries share the same NFC form.
+
+    Entries are grouped by their NFC normalization; each group must contain
+    exactly one entry, and that entry must be equal to its own NFC form.
+    """
+    lang = LANGUAGES[lang_code]
+    exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
+    normalized = defaultdict(set)
+
+    for g in exemplar:
+        # Compare brace-wrapped entries without their wrapper. Slice off one
+        # pair only: lstrip/rstrip would also eat braces inside the entry.
+        if g[0] == "{" and g[-1] == "}":
+            g = g[1:-1]
+        normalized[unicodedata.normalize("NFC", g)].add(g)
+
+    # Keep only the groups that break the rule so a failure is easy to read.
+    offenders = {n: gs for n, gs in normalized.items() if gs != {n}}
+    assert not offenders
+
+
+@pytest.mark.parametrize("lang_code", LANGUAGES)
+@pytest.mark.parametrize(
+    "exemplar_name",
+    ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
+)
 def test_languages_exemplars_duplicates(lang_code, exemplar_name):
     lang = LANGUAGES[lang_code]
     exemplar = getattr(lang.exemplar_chars, exemplar_name).split()