From 9d1e9bf397d912854c8bff9e05cda5d84d29b0be Mon Sep 17 00:00:00 2001
From: Behdad Esfahbod <behdad@behdad.org>
Date: Fri, 10 Nov 2023 10:08:07 -0700
Subject: [PATCH] Load encoding from LibreOffice dictionaries

Based on https://github.com/behdad/halfkern/issues/4
---
 ngrams.py | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/ngrams.py b/ngrams.py
index 7e8b8d0..606f971 100644
--- a/ngrams.py
+++ b/ngrams.py
@@ -52,17 +52,30 @@ def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ
 
 
 def extract_ngrams_from_file(filename, *kargs, **kwargs):
+    encoding = 'utf-8'
+    frqfile = None
     try:
         txtfile = open(filename, "rb")
-        # Assume hunspell dictionary format; drop everything after "/"
-        txtfile = (s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in txtfile)
-        frqfile = None
     except FileNotFoundError:
-        import bz2
-
-        # Assume harfbuzz-testing-wikipedia format
-        txtfile = bz2.open(filename + ".txt.bz2").read().splitlines()
-        frqfile = bz2.open(filename + ".frq.bz2").read().splitlines()
+        try:
+            import bz2
+
+            # Assume harfbuzz-testing-wikipedia format
+            txtfile = bz2.open(filename + ".txt.bz2").read().splitlines()
+            frqfile = bz2.open(filename + ".frq.bz2").read().splitlines()
+        except FileNotFoundError:
+            try:
+                # Assume hunspell dictionary format;
+                afffile = open(filename + ".aff", "rb")
+                for line in afffile:
+                    if line.startswith(b"SET"):
+                        encoding = line.replace(b"\t", b" ").split()[1].decode("ascii")
+                        break
+                txtfile = open(filename + ".dic", "rb")
+                txtfile = (s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in txtfile)
+
+            except FileNotFoundError:
+                raise FileNotFoundError("File not found: %s" % filename)
 
     return extract_ngrams(txtfile, *kargs, frequencies=frqfile, **kwargs)