Skip to content

Commit

Permalink
Load encoding from libreoffice dictionaries
Browse files Browse the repository at this point in the history
Based on #4
  • Loading branch information
behdad committed Nov 10, 2023
1 parent 5e14c60 commit 7bfd03c
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 14 deletions.
14 changes: 12 additions & 2 deletions kern_pair.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,15 @@ def surface_sum(surface, func=sum):


def kern_pair(
l, r, min_overlap, max_overlap, *, reduce=max, envelope="sdf", blurred=False, half=True
l,
r,
min_overlap,
max_overlap,
*,
reduce=max,
envelope="sdf",
blurred=False,
half=True
):
old_l_surface = l.surface
old_r_surface = r.surface
Expand Down Expand Up @@ -502,7 +510,9 @@ def find_s(*, reduce=max, envelope="sdf"):

all_bigrams = defaultdict(int)
for dictfile in options.dict or []:
this_bigrams = ngrams.extract_ngrams_from_file(dictfile, 2, cutoff=cutoff, encoding=encoding)
this_bigrams = ngrams.extract_ngrams_from_file(
dictfile, 2, cutoff=cutoff, encoding=encoding
)
for k, v in this_bigrams.items():
all_bigrams[k] += v
for bigram in all_bigrams:
Expand Down
8 changes: 6 additions & 2 deletions kern_triples.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,14 @@ def create_blurred_surface_for_text(text):
all_bigrams = defaultdict(int)
all_trigrams = defaultdict(int)
for dictfile in dictfiles:
this_bigrams = ngrams.extract_ngrams_from_file(dictfile, 2, cutoff=cutoff, encoding=encoding)
this_bigrams = ngrams.extract_ngrams_from_file(
dictfile, 2, cutoff=cutoff, encoding=encoding
)
for k, v in this_bigrams.items():
all_bigrams[k] += v
this_trigrams = ngrams.extract_ngrams_from_file(dictfile, 3, cutoff=cutoff, encoding=encoding)
this_trigrams = ngrams.extract_ngrams_from_file(
dictfile, 3, cutoff=cutoff, encoding=encoding
)
for k, v in this_trigrams.items():
all_trigrams[k] += v

Expand Down
38 changes: 28 additions & 10 deletions ngrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
LETTERS_ONLY = False


def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ, encoding="utf-8"):
def extract_ngrams(
text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ, encoding="utf-8"
):
if frequencies is None:
frequencies = itertools.cycle([min_freq])

Expand All @@ -28,7 +30,6 @@ def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ
words = [word[i:] for i in range(n)]

for ngram in zip(*words):

ngram = "".join(ngram)
ngrams[ngram] += freq

Expand All @@ -52,17 +53,34 @@ def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ


def extract_ngrams_from_file(filename, *kargs, **kwargs):
    """Load dictionary text from *filename* and extract n-grams from it.

    Three on-disk formats are tried, in order:

    1. *filename* itself, opened as raw bytes, one entry per line.
    2. harfbuzz-testing-wikipedia format: ``filename + ".txt.bz2"`` with a
       parallel ``filename + ".frq.bz2"`` holding per-line frequencies.
    3. hunspell dictionary format: ``filename + ".aff"`` is scanned for a
       ``SET <encoding>`` directive (which overrides ``kwargs["encoding"]``),
       then entries are read from ``filename + ".dic"``; the first line of a
       .dic file is an entry count and is skipped, and affix flags after
       ``/`` are stripped from each entry.

    Remaining positional/keyword arguments are forwarded to
    ``extract_ngrams`` along with the loaded frequencies (if any).

    Raises:
        FileNotFoundError: if none of the three formats is present.
    """
    frqfile = None
    try:
        txtfile = open(filename, "rb")
    except FileNotFoundError:
        try:
            import bz2

            # Assume harfbuzz-testing-wikipedia format
            txtfile = bz2.open(filename + ".txt.bz2").read().splitlines()
            frqfile = bz2.open(filename + ".frq.bz2").read().splitlines()
        except FileNotFoundError:
            try:
                # Assume hunspell dictionary format: the .aff file declares
                # the encoding of the companion .dic file on a "SET" line.
                # (`with` so the handle is closed even if parsing stops early.)
                with open(filename + ".aff", "rb") as afffile:
                    for line in afffile:
                        if line.startswith(b"SET"):
                            kwargs["encoding"] = (
                                line.replace(b"\t", b" ").split()[1].decode("ascii")
                            )
                            break
                txtfile = open(filename + ".dic", "rb")
                next(txtfile)  # Skip over the num entries line
                # Drop everything after "/" (hunspell affix flags).
                txtfile = (
                    s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in txtfile
                )

            except FileNotFoundError:
                raise FileNotFoundError("File not found: %s" % filename)

    return extract_ngrams(txtfile, *kargs, frequencies=frqfile, **kwargs)

Expand Down

0 comments on commit 7bfd03c

Please sign in to comment.