From 7bfd03c80684487b99bd79ced1ab2168956336fa Mon Sep 17 00:00:00 2001
From: Behdad Esfahbod
Date: Fri, 10 Nov 2023 10:08:07 -0700
Subject: [PATCH] Load encoding from libreoffice dictionaries

Based on https://github.com/behdad/halfkern/issues/4
---
 kern_pair.py    | 14 ++++++++++++--
 kern_triples.py |  8 ++++++--
 ngrams.py       | 38 ++++++++++++++++++++++++++++----------
 3 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/kern_pair.py b/kern_pair.py
index 8500943..1ef1ccf 100644
--- a/kern_pair.py
+++ b/kern_pair.py
@@ -192,7 +192,15 @@ def surface_sum(surface, func=sum):
 
 
 def kern_pair(
-    l, r, min_overlap, max_overlap, *, reduce=max, envelope="sdf", blurred=False, half=True
+    l,
+    r,
+    min_overlap,
+    max_overlap,
+    *,
+    reduce=max,
+    envelope="sdf",
+    blurred=False,
+    half=True
 ):
     old_l_surface = l.surface
     old_r_surface = r.surface
@@ -502,7 +510,9 @@ def find_s(*, reduce=max, envelope="sdf"):
 
 all_bigrams = defaultdict(int)
 for dictfile in options.dict or []:
-    this_bigrams = ngrams.extract_ngrams_from_file(dictfile, 2, cutoff=cutoff, encoding=encoding)
+    this_bigrams = ngrams.extract_ngrams_from_file(
+        dictfile, 2, cutoff=cutoff, encoding=encoding
+    )
     for k, v in this_bigrams.items():
         all_bigrams[k] += v
 for bigram in all_bigrams:
diff --git a/kern_triples.py b/kern_triples.py
index 5fb1bb6..4a6abe1 100644
--- a/kern_triples.py
+++ b/kern_triples.py
@@ -69,10 +69,14 @@ def create_blurred_surface_for_text(text):
 
 all_bigrams = defaultdict(int)
 all_trigrams = defaultdict(int)
 for dictfile in dictfiles:
-    this_bigrams = ngrams.extract_ngrams_from_file(dictfile, 2, cutoff=cutoff, encoding=encoding)
+    this_bigrams = ngrams.extract_ngrams_from_file(
+        dictfile, 2, cutoff=cutoff, encoding=encoding
+    )
     for k, v in this_bigrams.items():
         all_bigrams[k] += v
-    this_trigrams = ngrams.extract_ngrams_from_file(dictfile, 3, cutoff=cutoff, encoding=encoding)
+    this_trigrams = ngrams.extract_ngrams_from_file(
+        dictfile, 3, cutoff=cutoff, encoding=encoding
+    )
     for k, v in this_trigrams.items():
         all_trigrams[k] += v
diff --git a/ngrams.py b/ngrams.py
index 7e8b8d0..639b05d 100644
--- a/ngrams.py
+++ b/ngrams.py
@@ -5,7 +5,9 @@
 LETTERS_ONLY = False
 
 
-def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ, encoding="utf-8"):
+def extract_ngrams(
+    text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ, encoding="utf-8"
+):
     if frequencies is None:
         frequencies = itertools.cycle([min_freq])
 
@@ -28,7 +30,6 @@ def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ
         words = [word[i:] for i in range(n)]
 
         for ngram in zip(*words):
-
             ngram = "".join(ngram)
             ngrams[ngram] += freq
 
@@ -52,17 +53,34 @@ def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ
 
 
 def extract_ngrams_from_file(filename, *kargs, **kwargs):
+    frqfile = None
     try:
         txtfile = open(filename, "rb")
-        # Assume hunspell dictionary format; drop everything after "/"
-        txtfile = (s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in txtfile)
-        frqfile = None
     except FileNotFoundError:
-        import bz2
-
-        # Assume harfbuzz-testing-wikipedia format
-        txtfile = bz2.open(filename + ".txt.bz2").read().splitlines()
-        frqfile = bz2.open(filename + ".frq.bz2").read().splitlines()
+        try:
+            import bz2
+
+            # Assume harfbuzz-testing-wikipedia format
+            txtfile = bz2.open(filename + ".txt.bz2").read().splitlines()
+            frqfile = bz2.open(filename + ".frq.bz2").read().splitlines()
+        except FileNotFoundError:
+            try:
+                # Assume hunspell dictionary format; encoding is on the .aff "SET" line
+                afffile = open(filename + ".aff", "rb")
+                for line in afffile:
+                    if line.startswith(b"SET"):
+                        kwargs["encoding"] = (
+                            line.replace(b"\t", b" ").split()[1].decode("ascii")
+                        )
+                        break
+                txtfile = open(filename + ".dic", "rb")
+                next(txtfile)  # Skip over the num entries line
+                txtfile = (
+                    s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in txtfile
+                )
+
+            except FileNotFoundError:
+                raise FileNotFoundError("File not found: %s" % filename)
 
     return extract_ngrams(txtfile, *kargs, frequencies=frqfile, **kwargs)
 
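The .aff handling added to extract_ngrams_from_file above can be exercised
on its own. The sketch below is a minimal illustration of the same
detection logic, not part of the patch; detect_dic_encoding,
read_dic_words, and the "en_US" basename are hypothetical names used only
for this example.

    # Sketch (not from the patch): read the .dic encoding from the .aff
    # "SET" line, then decode the dictionary words with it.
    def detect_dic_encoding(basename, default="utf-8"):
        # LibreOffice/hunspell .aff files declare the .dic character set
        # on a line such as "SET ISO8859-1" or "SET UTF-8".
        with open(basename + ".aff", "rb") as afffile:
            for line in afffile:
                if line.startswith(b"SET"):
                    return line.replace(b"\t", b" ").split()[1].decode("ascii")
        return default

    def read_dic_words(basename):
        encoding = detect_dic_encoding(basename)
        with open(basename + ".dic", "rb") as dicfile:
            next(dicfile)  # first line holds the (approximate) entry count
            for line in dicfile:
                # Drop hunspell affix flags after "/", as the patch does.
                yield line.split(b"/")[0].decode(encoding).strip()

    # Usage with a hypothetical dictionary basename:
    # words = list(read_dic_words("en_US"))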