From 9d1e9bf397d912854c8bff9e05cda5d84d29b0be Mon Sep 17 00:00:00 2001
From: Behdad Esfahbod
Date: Fri, 10 Nov 2023 10:08:07 -0700
Subject: [PATCH] Load encoding from libreoffice dictionaries

Based on https://github.com/behdad/halfkern/issues/4
---
 ngrams.py | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/ngrams.py b/ngrams.py
index 7e8b8d0..606f971 100644
--- a/ngrams.py
+++ b/ngrams.py
@@ -52,17 +52,30 @@ def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ
 
 
 def extract_ngrams_from_file(filename, *kargs, **kwargs):
+    encoding = 'utf-8'
+    frqfile = None
     try:
         txtfile = open(filename, "rb")
-        # Assume hunspell dictionary format; drop everything after "/"
-        txtfile = (s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in txtfile)
-        frqfile = None
     except FileNotFoundError:
-        import bz2
-
-        # Assume harfbuzz-testing-wikipedia format
-        txtfile = bz2.open(filename + ".txt.bz2").read().splitlines()
-        frqfile = bz2.open(filename + ".frq.bz2").read().splitlines()
+        try:
+            import bz2
+
+            # Assume harfbuzz-testing-wikipedia format
+            txtfile = bz2.open(filename + ".txt.bz2").read().splitlines()
+            frqfile = bz2.open(filename + ".frq.bz2").read().splitlines()
+        except FileNotFoundError:
+            try:
+                # Assume hunspell dictionary format;
+                afffile = open(filename + ".aff", "rb")
+                for line in afffile:
+                    if line.startswith(b"SET"):
+                        encoding = line.replace(b"\t", b" ").split()[1].decode("ascii")
+                        break
+                txtfile = open(filename + ".dic", "rb")
+                txtfile = (s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in txtfile)
+
+            except FileNotFoundError:
+                raise FileNotFoundError("File not found: %s" % filename)
 
     return extract_ngrams(txtfile, *kargs, frequencies=frqfile, **kwargs)
 