Skip to content

Commit

Permalink
Load encoding from libreoffice dictionaries
Browse files Browse the repository at this point in the history
Based on #4
  • Loading branch information
behdad committed Nov 10, 2023
1 parent 5e14c60 commit 7bfd03c
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 14 deletions.
14 changes: 12 additions & 2 deletions kern_pair.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,15 @@ def surface_sum(surface, func=sum):


def kern_pair(
l, r, min_overlap, max_overlap, *, reduce=max, envelope="sdf", blurred=False, half=True
l,
r,
min_overlap,
max_overlap,
*,
reduce=max,
envelope="sdf",
blurred=False,
half=True
):
old_l_surface = l.surface
old_r_surface = r.surface
Expand Down Expand Up @@ -502,7 +510,9 @@ def find_s(*, reduce=max, envelope="sdf"):

all_bigrams = defaultdict(int)
for dictfile in options.dict or []:
this_bigrams = ngrams.extract_ngrams_from_file(dictfile, 2, cutoff=cutoff, encoding=encoding)
this_bigrams = ngrams.extract_ngrams_from_file(
dictfile, 2, cutoff=cutoff, encoding=encoding
)
for k, v in this_bigrams.items():
all_bigrams[k] += v
for bigram in all_bigrams:
Expand Down
8 changes: 6 additions & 2 deletions kern_triples.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,14 @@ def create_blurred_surface_for_text(text):
all_bigrams = defaultdict(int)
all_trigrams = defaultdict(int)
for dictfile in dictfiles:
this_bigrams = ngrams.extract_ngrams_from_file(dictfile, 2, cutoff=cutoff, encoding=encoding)
this_bigrams = ngrams.extract_ngrams_from_file(
dictfile, 2, cutoff=cutoff, encoding=encoding
)
for k, v in this_bigrams.items():
all_bigrams[k] += v
this_trigrams = ngrams.extract_ngrams_from_file(dictfile, 3, cutoff=cutoff, encoding=encoding)
this_trigrams = ngrams.extract_ngrams_from_file(
dictfile, 3, cutoff=cutoff, encoding=encoding
)
for k, v in this_trigrams.items():
all_trigrams[k] += v

Expand Down
38 changes: 28 additions & 10 deletions ngrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
LETTERS_ONLY = False


def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ, encoding="utf-8"):
def extract_ngrams(
text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ, encoding="utf-8"
):
if frequencies is None:
frequencies = itertools.cycle([min_freq])

Expand All @@ -28,7 +30,6 @@ def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ
words = [word[i:] for i in range(n)]

for ngram in zip(*words):

ngram = "".join(ngram)
ngrams[ngram] += freq

Expand All @@ -52,17 +53,34 @@ def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ


def extract_ngrams_from_file(filename, *kargs, **kwargs):
    """Load dictionary text from *filename* and extract n-grams from it.

    Three on-disk formats are tried, in order:

    1. *filename* itself, opened as raw bytes, one entry per line.
    2. harfbuzz-testing-wikipedia format: ``filename + ".txt.bz2"`` with a
       parallel ``filename + ".frq.bz2"`` holding per-line frequencies.
    3. hunspell dictionary format: ``filename + ".aff"`` is scanned for a
       ``SET <encoding>`` directive (which overrides ``kwargs["encoding"]``),
       then entries are read from ``filename + ".dic"``; the first line of a
       .dic file is an entry count and is skipped, and affix flags after
       ``/`` are stripped from each entry.

    Remaining positional/keyword arguments are forwarded to
    ``extract_ngrams`` along with the loaded frequencies (if any).

    Raises:
        FileNotFoundError: if none of the three formats is present.
    """
    frqfile = None
    try:
        txtfile = open(filename, "rb")
    except FileNotFoundError:
        try:
            import bz2

            # Assume harfbuzz-testing-wikipedia format
            txtfile = bz2.open(filename + ".txt.bz2").read().splitlines()
            frqfile = bz2.open(filename + ".frq.bz2").read().splitlines()
        except FileNotFoundError:
            try:
                # Assume hunspell dictionary format: the .aff file declares
                # the encoding of the companion .dic file on a "SET" line.
                # (`with` so the handle is closed even if parsing stops early.)
                with open(filename + ".aff", "rb") as afffile:
                    for line in afffile:
                        if line.startswith(b"SET"):
                            kwargs["encoding"] = (
                                line.replace(b"\t", b" ").split()[1].decode("ascii")
                            )
                            break
                txtfile = open(filename + ".dic", "rb")
                next(txtfile)  # Skip over the num entries line
                # Drop everything after "/" (hunspell affix flags).
                txtfile = (
                    s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in txtfile
                )

            except FileNotFoundError:
                raise FileNotFoundError("File not found: %s" % filename)

    return extract_ngrams(txtfile, *kargs, frequencies=frqfile, **kwargs)

Expand Down

0 comments on commit 7bfd03c

Please sign in to comment.