Skip to content

Commit

Permalink
Test languages exemplars canonical duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
moyogo committed Apr 28, 2023
1 parent 9434451 commit 3ef6dd3
Showing 1 changed file with 20 additions and 0 deletions.
20 changes: 20 additions & 0 deletions tests/test_data_languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,26 @@
@pytest.mark.parametrize(
"exemplar_name", ["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
)
def test_languages_exemplars_canonical_duplicates(lang_code, exemplar_name):
    """Check that no exemplar entries collapse to the same NFC form.

    The whitespace-separated entries of the selected exemplar set are
    grouped by their NFC-normalized form.  The test passes only when every
    group holds exactly one entry and that entry equals its NFC form, so it
    fails both for canonically equivalent duplicates and for entries that
    are not stored in NFC.
    """
    language = LANGUAGES[lang_code]
    entries = getattr(language.exemplar_chars, exemplar_name).split()
    variants_by_nfc = defaultdict(set)

    for entry in entries:
        # Entries wrapped in braces are compared without the braces.
        is_braced = entry[0] == "{" and entry[-1] == "}"
        grapheme = entry.lstrip("{").rstrip("}") if is_braced else entry
        variants_by_nfc[unicodedata.normalize("NFC", grapheme)].add(grapheme)

    # Comparing whole lists (rather than asserting per group) lets pytest
    # show every offending group in the failure diff.
    result = [(len(variants), variants) for variants in variants_by_nfc.values()]
    expected = [(1, {nfc_form}) for nfc_form in variants_by_nfc]
    assert result == expected


@pytest.mark.parametrize("lang_code", LANGUAGES)
@pytest.mark.parametrize(
"exemplar_name",
["base", "auxiliary", "marks", "numerals", "punctuation", "index"]
)
def test_languages_exemplars_duplicates(lang_code, exemplar_name):
lang = LANGUAGES[lang_code]
exemplar = getattr(lang.exemplar_chars, exemplar_name).split()
Expand Down

0 comments on commit 3ef6dd3

Please sign in to comment.