Skip to content

Commit

Permalink
Determine unique and most common ngrams for every language (#235)
Browse files Browse the repository at this point in the history
  • Loading branch information
pemistahl committed Sep 6, 2024
1 parent ef28e8b commit 3030bab
Show file tree
Hide file tree
Showing 706 changed files with 949 additions and 170 deletions.
169 changes: 113 additions & 56 deletions lingua/_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,42 +13,127 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import brotli
import json
import math
import regex

from collections import Counter, defaultdict, OrderedDict
from dataclasses import dataclass
from enum import Enum, auto
from fractions import Fraction
from typing import Any, Counter as TypedCounter, Dict, List, Optional
from math import log
from pathlib import Path
from typing import Any, Counter as TypedCounter, Dict, FrozenSet, List, Optional

from .language import Language
from ._ngram import _NgramRange
from ._ngram import _NgramRange, _get_ngram_name_by_length


class _LinguaJSONEncoder(json.JSONEncoder):
class _NgramProbabilitiesJSONEncoder(json.JSONEncoder):
def default(self, obj: Any) -> Any:
if isinstance(obj, _JSONLanguageModel):
return {"language": obj.language.name, "ngrams": obj.ngrams}
if isinstance(obj, _TrainingDataLanguageModel):
language = obj.language.name
ngrams = self.encode_frequencies(obj.relative_frequencies)
return {"language": language, "ngrams": ngrams}
return json.JSONEncoder.default(self, obj)

def encode_frequencies(self, obj: Optional[Dict[str, Fraction]]) -> Dict[str, str]:
    """Group ngrams by their relative frequency for compact serialization.

    Args:
        obj: Mapping of ngram to its relative frequency, or ``None``.

    Returns:
        An insertion-ordered mapping whose keys are the fractions rendered
        as ``"numerator/denominator"`` and whose values are all ngrams
        sharing that fraction, sorted and joined by single spaces. The
        mapping is empty when ``obj`` is ``None`` or empty.
    """
    fractions_to_ngrams: Dict[Fraction, List[str]] = defaultdict(list)
    if obj is not None:
        for ngram, fraction in obj.items():
            fractions_to_ngrams[fraction].append(ngram)

    fractions_to_joined_ngrams: Dict[str, str] = OrderedDict()
    for fraction, ngrams in fractions_to_ngrams.items():
        fraction_str = f"{fraction.numerator}/{fraction.denominator}"
        # Sorting makes the joined string independent of the input order.
        fractions_to_joined_ngrams[fraction_str] = " ".join(sorted(ngrams))
    return fractions_to_joined_ngrams

class _LinguaJSONDecoder(json.JSONDecoder):

class _NgramProbabilitiesJSONDecoder(json.JSONDecoder):
    """JSON decoder that builds an ngram probability model.

    Every decoded JSON object that has both a ``"language"`` and an
    ``"ngrams"`` key is converted into a ``_NgramProbabilityModel``;
    all other objects are returned unchanged.
    """

    def __init__(self, *args, **kwargs):
        # Forwarded positional arguments go before the keyword argument so
        # the call reads in the order in which Python binds the arguments.
        super().__init__(*args, object_hook=self.object_hook, **kwargs)

    def object_hook(self, obj: Any) -> Any:
        """Convert a serialized model dict into a ``_NgramProbabilityModel``.

        Args:
            obj: A value produced by the JSON parser.

        Returns:
            A ``_NgramProbabilityModel`` when ``obj`` is a dict with the
            keys ``"language"`` and ``"ngrams"``, otherwise ``obj`` itself.

        Raises:
            KeyError: if the ``"language"`` value is not a member name of
                ``Language``.
        """
        if isinstance(obj, dict) and "language" in obj and "ngrams" in obj:
            language = Language[obj["language"]]
            ngrams = self.parse_frequencies(obj["ngrams"])
            return _NgramProbabilityModel(language, ngrams)
        return obj

    def parse_frequencies(self, obj: Dict[str, str]) -> Dict[str, float]:
        """Expand the grouped fraction mapping into per-ngram log values.

        Args:
            obj: Mapping whose keys are fractions written as
                ``"numerator/denominator"`` and whose values are ngrams
                separated by single spaces.

        Returns:
            Mapping of every ngram to the natural logarithm of its fraction.
        """
        frequencies: Dict[str, float] = {}
        for fraction, ngrams in obj.items():
            numerator, denominator = fraction.split("/")
            # Computed once per fraction and shared by all of its ngrams.
            frequency = log(int(numerator) / int(denominator))
            for ngram in ngrams.split(" "):
                frequencies[ngram] = frequency
        return frequencies


class _NgramsJSONDecoder(json.JSONDecoder):
def __init__(self, *args, **kwargs):
json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs)

def object_hook(self, obj: Any) -> Any:
if isinstance(obj, dict) and "language" in obj and "ngrams" in obj:
language = Language[obj["language"]]
ngrams = self.object_hook(obj["ngrams"])
return _JSONLanguageModel(language, ngrams)
return _NgramModel(language, frozenset(ngrams))
return obj


@dataclass
class _JSONLanguageModel:
class _NgramProbabilityModel:
language: Language
ngrams: Dict[str, float]


@dataclass
class _NgramModel:
language: Language
ngrams: Dict[str, str]
ngrams: FrozenSet[str]


class _NgramModelType(Enum):
    """Kinds of ngram set models that can be loaded from disk.

    The lower-cased member name is used by ``_load_ngram_model`` as the
    prefix of the model file name.
    """

    UNIQUE = auto()
    MOSTCOMMON = auto()


def _load_ngram_probability_model(
    language: Language, ngram_length: int
) -> Optional[_NgramProbabilityModel]:
    """Load the ngram probability model of a language from disk.

    The model is read from the brotli-compressed JSON file
    ``language-models/<iso code>/<ngram name>s.json.br`` located next to
    this module.

    Args:
        language: The language whose model is loaded.
        ngram_length: The ngram length, resolved to a file name part by
            ``_get_ngram_name_by_length``.

    Returns:
        The decoded model, or ``None`` if the model file does not exist.
    """
    ngram_name = _get_ngram_name_by_length(ngram_length)
    iso_code = language.iso_code_639_1.name.lower()
    models_directory = Path(__file__).parent / "language-models" / iso_code
    model_file_path = models_directory / f"{ngram_name}s.json.br"
    # Only the file read can raise FileNotFoundError, so only it is guarded;
    # decompression and decoding errors propagate to the caller.
    try:
        compressed_json = model_file_path.read_bytes()
    except FileNotFoundError:
        return None
    ngrams_json = brotli.decompress(compressed_json).decode("utf-8")
    return json.loads(ngrams_json, cls=_NgramProbabilitiesJSONDecoder)


def _load_ngram_model(
    language: Language, ngram_length: int, model_type: _NgramModelType
) -> Optional[_NgramModel]:
    """Load an ngram set model of the given type for a language from disk.

    The model is read from the brotli-compressed JSON file
    ``language-models/<iso code>/<model type>_<ngram name>s.json.br``
    located next to this module, where ``<model type>`` is the lower-cased
    name of ``model_type``.

    Args:
        language: The language whose model is loaded.
        ngram_length: The ngram length, resolved to a file name part by
            ``_get_ngram_name_by_length``.
        model_type: Which kind of ngram set to load.

    Returns:
        The decoded model, or ``None`` if the model file does not exist.
    """
    ngram_name = _get_ngram_name_by_length(ngram_length)
    iso_code = language.iso_code_639_1.name.lower()
    models_directory = Path(__file__).parent / "language-models" / iso_code
    model_file_name = f"{model_type.name.lower()}_{ngram_name}s.json.br"
    # Only the file read can raise FileNotFoundError, so only it is guarded;
    # decompression and decoding errors propagate to the caller.
    try:
        compressed_json = (models_directory / model_file_name).read_bytes()
    except FileNotFoundError:
        return None
    ngrams_json = brotli.decompress(compressed_json).decode("utf-8")
    return json.loads(ngrams_json, cls=_NgramsJSONDecoder)


@dataclass
Expand Down Expand Up @@ -78,36 +163,10 @@ def from_text(
relative_frequencies=relative_frequencies,
)

@classmethod
def from_json(cls, serialized_json: str) -> Dict[str, float]:
json_language_model: _JSONLanguageModel = json.loads(
serialized_json, cls=_LinguaJSONDecoder
)
frequencies = {}

for fraction, ngrams in json_language_model.ngrams.items():
numerator, denominator = fraction.split("/")
frequency = math.log(int(numerator) / int(denominator))
for ngram in ngrams.split(" "):
frequencies[ngram] = frequency

return frequencies

def to_json(self) -> str:
fractions_to_ngrams = defaultdict(list)
if self.relative_frequencies is not None:
for ngram, fraction in self.relative_frequencies.items():
fractions_to_ngrams[fraction].append(ngram)

fractions_to_joined_ngrams = OrderedDict()
for fraction, ngrams in fractions_to_ngrams.items():
fraction_str = f"{fraction.numerator}/{fraction.denominator}"
fractions_to_joined_ngrams[fraction_str] = " ".join(
sorted(map(lambda n: n, ngrams))
)

model = _JSONLanguageModel(self.language, fractions_to_joined_ngrams)
return regex.sub(r"([:,])\s*", r"\1", json.dumps(model, cls=_LinguaJSONEncoder))
return regex.sub(
r"([:,])\s*", r"\1", json.dumps(self, cls=_NgramProbabilitiesJSONEncoder)
)

@classmethod
def compute_absolute_frequencies(
Expand Down Expand Up @@ -144,21 +203,19 @@ def compute_relative_frequencies(
return ngram_probabilities


@dataclass
class _TestDataLanguageModel:
ngrams: List[List[str]]
def _create_ngrams(words: List[str], ngram_length: int) -> FrozenSet[str]:
    """Collect every distinct substring of a fixed length from the words.

    Args:
        words: The words to split into ngrams.
        ngram_length: The length of the ngrams; must be between 1 and 5.

    Returns:
        The set of all substrings of length ``ngram_length`` found in any
        word. Words shorter than ``ngram_length`` contribute nothing.

    Raises:
        ValueError: if ``ngram_length`` is not between 1 and 5.
    """
    if ngram_length not in range(1, 6):
        # The upper bound "6" in the message is exclusive, like range(1, 6).
        raise ValueError(f"ngram length {ngram_length} is not in range 1..6")
    # For words shorter than ngram_length the inner range is empty, so no
    # separate length check is needed.
    return frozenset(
        word[start : start + ngram_length]
        for word in words
        for start in range(len(word) - ngram_length + 1)
    )

@classmethod
def from_text(cls, words: List[str], ngram_length: int) -> "_TestDataLanguageModel":
if ngram_length not in range(1, 6):
raise ValueError(f"ngram length {ngram_length} is not in range 1..6")
ngrams = set()
for word in words:
chars_count = len(word)
if chars_count >= ngram_length:
for i in range(0, chars_count - ngram_length + 1):
substr = word[i : i + ngram_length]
ngrams.add(substr)

lower_order_ngrams = [list(_NgramRange(ngram)) for ngram in ngrams]
return _TestDataLanguageModel(lower_order_ngrams)

def _create_lower_order_ngrams(words: List[str], ngram_length: int) -> List[List[str]]:
    """Build, for every ngram of the words, the list given by ``_NgramRange``.

    Args:
        words: The words to split into ngrams.
        ngram_length: The length of the ngrams passed to ``_create_ngrams``.

    Returns:
        One list per distinct ngram, holding the items produced by iterating
        ``_NgramRange(ngram)`` (presumably the ngram followed by its
        lower-order ngrams; verify against ``_NgramRange``). The outer order
        follows set iteration order and is therefore unspecified.
    """
    return [list(_NgramRange(ngram)) for ngram in _create_ngrams(words, ngram_length)]
17 changes: 10 additions & 7 deletions lingua/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from .language import Language

_MISSING_LANGUAGE_MESSAGE: str = (
"LanguageDetector needs at least 2 languages to choose from"
"LanguageDetector needs at least 1 language to choose from"
)


Expand Down Expand Up @@ -93,16 +93,19 @@ def from_all_languages_without(
with all built-in languages except those passed to this method.
"""
languages_to_load = Language.all().difference(languages)
if len(languages_to_load) < 2:
if len(languages_to_load) == 0:
raise ValueError(_MISSING_LANGUAGE_MESSAGE)
return cls._from(languages_to_load)

@classmethod
def from_languages(cls, *languages: Language) -> "LanguageDetectorBuilder":
"""Create and return an instance of LanguageDetectorBuilder
with the languages passed to this method.
Raises:
ValueError: if no language is specified
"""
if len(languages) < 2:
if len(languages) == 0:
raise ValueError(_MISSING_LANGUAGE_MESSAGE)
return cls._from(languages)

Expand All @@ -115,9 +118,9 @@ def from_iso_codes_639_1(
to this method.
Raises:
ValueError: if less than two ISO codes are specified
ValueError: if no ISO code is specified
"""
if len(iso_codes) < 2:
if len(iso_codes) == 0:
raise ValueError(_MISSING_LANGUAGE_MESSAGE)
languages = set()
for iso_code in iso_codes:
Expand All @@ -134,9 +137,9 @@ def from_iso_codes_639_3(
to this method.
Raises:
ValueError: if less than two ISO codes are specified
ValueError: if no ISO code is specified
"""
if len(iso_codes) < 2:
if len(iso_codes) == 0:
raise ValueError(_MISSING_LANGUAGE_MESSAGE)
languages = set()
for iso_code in iso_codes:
Expand Down
Loading

0 comments on commit 3030bab

Please sign in to comment.