# Provenance: snippets/fix-exemplars-duplicates.py, introduced by commit
# 9e481e65fbe7541aa964c092bd3b2fb79ef1f8c8
# ("Add snippets/fix-exemplars-duplicates.py", Denis Moyogo Jacquerye,
# Mon, 31 Oct 2022 17:32:48 +0100).
"""Remove duplicated entries from the exemplar lists of language files.

Each path given on the command line is parsed as a text-format
``LanguageProto``.  For every exemplar attribute listed in ``ATTRIBUTES``
the space-separated value is de-duplicated (first occurrence wins, order is
kept).  Files in which at least one attribute changed are rewritten in place
and a summary of the removed duplicates is printed.

Usage::

    python fix-exemplars-duplicates.py PATH [PATH ...]
"""
import sys
from collections import Counter

from google.protobuf import text_format
from gflanguages import languages_public_pb2

# Names of the exemplar_chars attributes that are checked for duplicates.
ATTRIBUTES = "base auxiliary marks punctuation index".split(" ")


def _dedupe(values):
    """Return *values* without repeats, keeping each first occurrence in order."""
    # dict preserves insertion order, so this is an ordered "unique".
    return list(dict.fromkeys(values))


def _report_change(path, exemplar, before, after):
    """Print a summary of the duplicates removed from one exemplar attribute."""
    counter = Counter(before)
    # (item, number of surplus copies) for every item that appeared more than once.
    duplicates = [(g, c - 1) for g, c in counter.most_common() if c > 1]
    print(
        f"Changed {path} {exemplar} exemplar:\n"
        + f"- from {len(before)} ({len(set(before))} as set) "
        + f"to {len(after)} elements\n"
        + f"- removing {len(before) - len(after)} duplicate(s):\n"
        + f" {duplicates}\n"
    )


def main(args=None):
    """De-duplicate the exemplar lists of every language file in *args*.

    Args:
        args: Iterable of paths to text-format ``LanguageProto`` files.
            When ``None``, the command-line arguments (``sys.argv[1:]``)
            are used.

    Raises:
        SystemExit: If the de-duplicated list does not contain the same
            distinct items as the original (sanity check), with the message
            ``"Failed fixing exemplar."``.
    """
    if args is None:
        args = sys.argv[1:]

    for path in args:
        with open(path, encoding="utf-8") as fp:
            language = text_format.Parse(
                fp.read(), languages_public_pb2.LanguageProto()
            )

        # Skip this file only; the remaining paths must still be processed.
        if not hasattr(language, "exemplar_chars"):
            continue

        # attr -> {"before": [...], "after": [...]} for each attribute changed.
        exemplar_values = {}
        for attr in ATTRIBUTES:
            if not hasattr(language.exemplar_chars, attr):
                continue

            # Split on a single space on purpose: the value is re-joined with
            # " " below, so untouched lists round-trip unchanged.
            values = getattr(language.exemplar_chars, attr).split(" ")
            clean_values = _dedupe(values)
            if clean_values == values:
                continue

            # Sanity check: removing repeats must not lose any distinct item.
            if len(set(values)) != len(set(clean_values)):
                print("before: " + " ".join(values))
                print("after: " + " ".join(clean_values))
                sys.exit("Failed fixing exemplar.")

            setattr(language.exemplar_chars, attr, " ".join(clean_values))
            exemplar_values[attr] = {
                "before": values,
                "after": clean_values,
            }

        # Nothing changed: leave the file on disk untouched.
        if not exemplar_values:
            continue

        for exemplar, values in exemplar_values.items():
            _report_change(path, exemplar, values["before"], values["after"])

        with open(path, "w", encoding="utf-8") as fp:
            fp.write(text_format.MessageToString(language, as_utf8=True))


if __name__ == "__main__":
    main(args=sys.argv[1:])