From 4aa2fdc135d3b0e80460dfe1bbd77544c0a0061f Mon Sep 17 00:00:00 2001
From: KG
Date: Fri, 3 Jan 2025 02:26:19 -0500
Subject: [PATCH] Do tokens gooder

---
 tests/tivars.py              |   6 +-
 tivars/__init__.py           |   4 +-
 tivars/models/model.py       |  84 ++++----------------
 tivars/models/versions.py    |   4 +-
 tivars/token.py              |  44 ++++++++++++++
 tivars/tokenizer/__init__.py |   5 +-
 tivars/tokenizer/decoder.py  |  21 +++----
 tivars/tokenizer/encoder.py  |   6 +-
 tivars/tokenizer/state.py    |  23 ++++---
 tivars/trie.py               | 113 +++++++++++++++++++++++++++++++++++
 tivars/types/gdb.py          |   1 -
 tivars/types/tokenized.py    |  30 ++++++----
 12 files changed, 219 insertions(+), 122 deletions(-)
 create mode 100644 tivars/token.py
 create mode 100644 tivars/trie.py

diff --git a/tests/tivars.py b/tests/tivars.py
index c0a95c1..e5bd31b 100644
--- a/tests/tivars.py
+++ b/tests/tivars.py
@@ -170,9 +170,11 @@ def test_load_from_string(self):
         test_program.load_string(string := "setDate(1")
 
         self.assertEqual(test_program.string(), string)
+        self.assertEqual(f"{test_program:a}", string)
         self.assertEqual(f"{test_program:02d: }", f"00: {string}")
-        self.assertEqual(test_program.tokens(), [TI_84PCE.tokens.bytes[b'\xef\x00'],
-                                                 TI_84PCE.tokens.bytes[b'1']])
+
+        self.assertEqual(test_program.tokens(), [TI_84PCE.tokens["setDate("],
+                                                 TI_84PCE.tokens[b'1']])
 
         # Version is wrong(?)
         test_program.version = 0x04
diff --git a/tivars/__init__.py b/tivars/__init__.py
index deca750..a60fa16 100644
--- a/tivars/__init__.py
+++ b/tivars/__init__.py
@@ -4,10 +4,10 @@
 
 
 from .flash import *
+from .models import *
 from .tokenizer import *
 from .types import *
-from .models import *
 from .var import *
 
 
-__all__ = list({*flash.__all__, *tokenizer.__all__, *types.__all__, *models.__all__, *var.__all__})
+__all__ = list({*flash.__all__, *models.__all__, *tokenizer.__all__, *types.__all__, *var.__all__})
diff --git a/tivars/models/model.py b/tivars/models/model.py
index 1c0b60c..9298425 100644
--- a/tivars/models/model.py
+++ b/tivars/models/model.py
@@ -8,8 +8,8 @@
 from functools import total_ordering
 
 from tivars.flags import *
-from tivars.tokens.scripts import OsVersion, Tokens, TokenTrie
-from tivars.tokens.scripts.parse import MODEL_ORDER
+from tivars.tokens.scripts.parse import MODEL_ORDER, OsVersion, Tokens
+from tivars.trie import *
 
 
 @total_ordering
@@ -29,20 +29,14 @@ class TIModel:
     """
 
     def __init__(self, name: str, features: 'TIFeature', magic: str, product_id: int, lang: str):
-        self._name = name
-        self._features = TIFeature(features)
-        self._magic = magic
-        self._product_id = product_id
-        self._lang = lang
+        self.name = name
+        self.features = TIFeature(features)
+        self.magic = magic
+        self.product_id = product_id
+        self.lang = lang
 
         with open(os.path.join(os.path.dirname(__file__), "../tokens/8X.xml"), encoding="UTF-8") as file:
-            self._tokens = Tokens.from_xml_string(file.read(), self.OS("latest"))
-
-        self._trie = {}
-        for lang in self._tokens.langs:
-            self._trie[lang] = TokenTrie.from_tokens(self._tokens, lang)
-
-        self._trie[None] = self._trie["en"]
+            self.tokens = TITokens(Tokens.from_xml_string(file.read(), self.OS("latest")))
 
     def __eq__(self, other):
         return str(self) == str(other)
@@ -56,71 +50,13 @@ def __hash__(self):
     def __str__(self):
         return self.name
 
-    @property
-    def features(self) -> 'TIFeature':
-        """
-        :return: This model's features
-        """
-
-        return self._features
-
-    @property
-    def lang(self) -> str:
-        """
-        :return: This model's native language
-        """
-
-        return self._lang
-
-    @property
-    def magic(self) -> str:
-        """
-        :return: This model's file magic
-        """
-
-        return self._magic
-
-    @property
-    def name(self) -> str:
-        """
-        :return: This model's (abbreviated) name
-        """
-
-        return self._name
-
     @property
     def order(self) -> int:
         """
         :return: This model's order within the chronology used by the token sheets
         """
 
-        return MODEL_ORDER[self._name]
-
-    @property
-    def product_id(self) -> int:
-        """
-        :return: This model's product ID
-        """
-
-        return self._product_id
-
-    @property
-    def tokens(self) -> Tokens:
-        """
-        :return: The tokens supported by this model
-        """
-
-        return self._tokens
-
-    def get_trie(self, lang: str = None) -> TokenTrie:
-        """
-        Gets the token trie for this model corresponding to a given language
-
-        :param lang: A language code (defaults to English, ``en``)
-        :return: The token trie corresponding to ``lang``
-        """
-
-        return self._trie[lang]
+        return MODEL_ORDER[self.name]
 
     def has(self, feature: 'TIFeature'):
         """
@@ -130,7 +66,7 @@ def has(self, feature: 'TIFeature'):
         :return: Whether this model has ``feature``
         """
 
-        return feature in self._features
+        return feature in self.features
 
     def OS(self, version: str = "") -> OsVersion:
         """
diff --git a/tivars/models/versions.py b/tivars/models/versions.py
index e7d1fba..85f802a 100644
--- a/tivars/models/versions.py
+++ b/tivars/models/versions.py
@@ -3,11 +3,11 @@
 """
 
 
-import tivars.tokens.scripts.parse as parse
+import tivars.tokens.scripts as tokens
 
 from .model import *
 
 
-class OsVersions(parse.OsVersions):
+class OsVersions(tokens.OsVersions):
     """
     Namespace containing useful OS versions
diff --git a/tivars/token.py b/tivars/token.py
new file mode 100644
index 0000000..0228563
--- /dev/null
+++ b/tivars/token.py
@@ -0,0 +1,44 @@
+from tivars.tokens.scripts import *
+
+
+class TIToken(Token):
+    """
+    Interface extension for the token sheets base ``Token`` container
+
+    TITokens can be fetched by bytes or recognized names from a `TITokens` container attached to a `TIModel`.
+    Instantiating your own `TIToken` is not recommended.
+    """
+
+    def __init__(self, token: Token):
+        super().__init__(token.bits, token.langs, token.attrs, token.since, token.until)
+
+        self.translation = self.langs[None] = self.langs["en"]
+
+    def __repr__(self) -> str:
+        return f"<{self.display} ({self.escape})>"
+
+    @property
+    def accessible(self) -> str:
+        return self.translation.accessible
+
+    @property
+    def display(self) -> str:
+        return self.translation.display
+
+    @property
+    def escape(self) -> str:
+        return rf"\{'x' if len(self.bits) == 1 else 'u'}{self.bits.hex()}"
+
+    def names(self) -> list[str]:
+        return self.translation.names()
+
+
+class IllegalToken(TIToken):
+    def __init__(self, bits: bytes):
+        self.bits = bits
+
+        super().__init__(Token(bits, {"en": Translation(b'?', "?", self.escape, [])},
+                               {"illegal": "true"}))
+
+
+__all__ = ["TIToken", "IllegalToken"]
diff --git a/tivars/tokenizer/__init__.py b/tivars/tokenizer/__init__.py
index 196434d..96ffe4d 100644
--- a/tivars/tokenizer/__init__.py
+++ b/tivars/tokenizer/__init__.py
@@ -7,7 +7,9 @@
 
 from tivars.data import String
 from tivars.models import *
+from tivars.token import *
 from tivars.tokens.scripts import *
+from tivars.trie import *
 
 from .decoder import *
 from .encoder import *
@@ -41,6 +43,7 @@ class Name(TokenizedString):
 
     @classmethod
    def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
+        # Is this necessary?
         mode = "max" if instance is not None and instance.leading_name_byte else "string"
         data = encode(value, mode=mode)[0].rstrip(b'\x00')
 
@@ -52,4 +55,4 @@
 
 
 __all__ = ["decode", "encode", "normalize", "Name", "TokenizedString",
-           "Token", "Tokens", "OsVersion", "OsVersions"]
+           "TIToken", "IllegalToken", "TITokenTrie", "TITokens", "OsVersion", "OsVersions"]
diff --git a/tivars/tokenizer/decoder.py b/tivars/tokenizer/decoder.py
index f1dba3d..dde8034 100644
--- a/tivars/tokenizer/decoder.py
+++ b/tivars/tokenizer/decoder.py
@@ -6,18 +6,13 @@
 from warnings import warn
 
 from tivars.models import *
-from tivars.tokens.scripts import *
+from tivars.token import *
+from tivars.trie import *
 
 
-def invalid_token(bits: bytes) -> Token:
-    name = rf"\x{bits.hex()}" if len(bits) == 1 else rf"\u{bits.hex()}"
-    return Token(bits, {"en": Translation(b'?', "?", name, [])})
-
-
-
-def decode(bytestream: bytes, *, tokens: Tokens = None) -> tuple[list[Token], OsVersion]:
+def decode(bytestream: bytes, *, tokens: TITokens = None) -> tuple[list[TIToken], OsVersion]:
     """
-    Decodes a byte stream into a list of `Token` objects and its minimum supported OS version
+    Decodes a byte stream into a list of `TIToken` objects and its minimum supported OS version
 
     Each token is represented using one of three different representation formats, dictated by ``mode``:
 
     - ``display``: Represents the tokens with Unicode characters matching the calculator's display
     - ``accessible``: Represents the tokens with ASCII-only equivalents, often requiring multi-character glyphs
     - ``ti_ascii``: Represents the tokens with their internal font indices (returns a ``bytes`` object)
 
     :param bytestream: The token bytes to decode
-    :param tokens: The `Tokens` object to use for decoding (defaults to the TI-84+CE tokens)
-    :return: A tuple of a list of `Token` objects and a minimum `OsVersion`
+    :param tokens: The `TITokens` object to use for decoding (defaults to the TI-84+CE tokens)
+    :return: A tuple of a list of `TIToken` objects and a minimum `OsVersion`
     """
 
     tokens = tokens or TI_84PCE.tokens
@@ -25,8 +20,8 @@ def decode(bytestream: bytes, *, tokens: Tokens = None) -> tuple[list[Token], Os
             warn(f"Unrecognized byte(s) '0x{curr_hex}' at position {index}.",
                  BytesWarning)
 
-            out.append(invalid_token(curr_bytes))
+            out.append(IllegalToken(curr_bytes))
             curr_bytes = b''
 
         elif curr_bytes[-1]:
             count = 0
             while not curr_bytes[0]:
                 curr_bytes = curr_bytes[1:]
                 count += 1
-                out.append(invalid_token(b'\x00'))
+                out.append(IllegalToken(b'\x00'))
 
             warn(f"There are {count} unexpected null bytes at position {index}."
                 if count > 1 else f"There is an unexpected null byte at position {index}.",
diff --git a/tivars/tokenizer/encoder.py b/tivars/tokenizer/encoder.py
index 6124553..22393f8 100644
--- a/tivars/tokenizer/encoder.py
+++ b/tivars/tokenizer/encoder.py
@@ -7,12 +7,12 @@
 import unicodedata
 
 from tivars.models import *
-from tivars.tokens.scripts import *
+from tivars.trie import *
 
 from .state import *
 
 
 def encode(string: str, *,
-           trie: TokenTrie = None, mode: str = None, normalize: bool = True) -> tuple[bytes, OsVersion]:
+           trie: TITokenTrie = None, mode: str = None, normalize: bool = True) -> tuple[bytes, OsVersion]:
     """
     Encodes a string of tokens represented as text into a byte stream and its minimum supported OS version
 
@@ -43,7 +43,7 @@ def encode(string: str, *,
     """
 
     string = _normalize(string) if normalize else string
-    trie = trie or TI_84PCE.get_trie()
+    trie = trie or TI_84PCE.tokens.tries[None]
    mode = mode or "smart"
 
     data = b''
diff --git a/tivars/tokenizer/state.py b/tivars/tokenizer/state.py
index 9b0f3f5..a14c387 100644
--- a/tivars/tokenizer/state.py
+++ b/tivars/tokenizer/state.py
@@ -5,7 +5,8 @@
 
 from string import punctuation
 
-from tivars.tokens.scripts import *
+from tivars.token import *
+from tivars.trie import *
 
 
 class EncoderState:
@@ -28,7 +29,7 @@ class EncoderState:
     def __init__(self, length: int = 0):
         self.length = length
 
-    def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['EncoderState']]:
+    def munch(self, string: str, trie: TITokenTrie) -> tuple[TIToken, str, list['EncoderState']]:
         """
         Munch the input string and determine the resulting token, encoder state, and remainder of the string
 
@@ -41,13 +42,11 @@ def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['Encoder
         if string.startswith(r"\x") or string.startswith(r"\u"):
             length = 4 if string.startswith(r"\x") else 6
             string, remainder = string[:length], string[length:]
-            token = Token(bytes.fromhex(string.lstrip(r"\ux")),
-                          {"en": Translation(b'?', string, string, [])},
-                          {"illegal": "true"})
+            token = IllegalToken(bytes.fromhex(string.lstrip(r"\ux")))
 
             return token, remainder, self.next(token)
 
-        tokens = trie.get_tokens(string)
+        tokens = trie.match(string)
         if not tokens:
             raise ValueError("no tokenization options exist")
@@ -63,7 +62,7 @@ def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['Encoder
 
         return token, remainder, self.next(token)
 
-    def next(self, token: Token) -> list['EncoderState']:
+    def next(self, token: TIToken) -> list['EncoderState']:
         """
         Determines the next encode state given a token
 
@@ -102,7 +101,7 @@ class Line(EncoderState):
     Encoder state which is always exited after a line break
     """
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x04' | b'\x3F':
                 return []
@@ -118,7 +117,7 @@ class Name(Line):
 
     mode = -1
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         # Digits, uppercase letters, and theta
         if b'\x30' <= token.bits <= b'\x39' or b'\x41' <= token.bits <= b'\x5B':
             return super().next(token)
@@ -150,7 +149,7 @@ class String(Line):
 
     mode = -1
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return []
@@ -176,7 +175,7 @@ class InterpolationStart(Line):
 
     mode = 0
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return [InterpolatedString()]
@@ -192,7 +191,7 @@ class SmartMode(EncoderState):
 
     mode = 0
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return [self, String()]
diff --git a/tivars/trie.py b/tivars/trie.py
new file mode 100644
index 0000000..cd37b78
--- /dev/null
+++ b/tivars/trie.py
@@ -0,0 +1,113 @@
+from tivars.tokens.scripts import *
+from .token import TIToken
+
+
+class TITokenTrie:
+    """
+    Trie for tokenizing text based on ``tivars.tokens.scripts.TokenTrie``
+    """
+
+    def __init__(self):
+        self.token = None
+        self.children = {}
+
+    def insert(self, token: TIToken, lang: str = None):
+        """
+        Inserts the names of a `TIToken` into the trie in a given language
+
+        :param token: The token to insert
+        :param lang: The language to insert names from (defaults to English, ``en``)
+        """
+
+        if lang and lang not in token.langs:
+            raise ValueError(f"lang '{lang}' not found or not yet supported")
+
+        for name in token.langs[lang].names() if lang else token.names():
+            current = self
+            for char in name:
+                if char not in current.children:
+                    current.children[char] = self.__class__()
+
+                current = current.children[char]
+
+            current.token = token
+
+    @classmethod
+    def from_tokens(cls, tokens: 'TITokens', lang: str = None):
+        """
+        Builds a trie from all tokens in a `TITokens` container
+
+        :param tokens: The tokens to insert
+        :param lang: The language to insert names from (defaults to English, ``en``)
+        :return: A `TITokenTrie` containing every name of every token in ``tokens``
+        """
+
+        if lang and lang not in tokens.langs:
+            raise ValueError(f"lang '{lang}' not found or not yet supported")
+
+        root = cls()
+        for token in tokens.bytes.values():
+            root.insert(token, lang)
+
+        return root
+
+    def match(self, string: str) -> list[tuple[TIToken, str]]:
+        """
+        Finds all tokens which can be parsed from a given input string
+
+        Each token is returned with the portion of the input string still remaining.
+        Output is sorted by decreasing length of the consumed input.
+
+        :param string: The string to match against
+        :return: A list of tuples each containing a `TIToken` and its remaining input
+        """
+
+        tokens = []
+
+        if string and string[0] in self.children:
+            tokens += self.children[string[0]].match(string[1:])
+
+        if self.token:
+            tokens.append((self.token, string))
+
+        return tokens
+
+
+class TITokens:
+    """
+    Data class for storing collections of `TIToken` instances based on ``tivars.tokens.scripts.Tokens``
+
+    `TIToken` instances may be obtained from various maps:
+
+    - The byte map is indexed by token bytes.
+    - The lang map is indexed by language code, then token name.
+    - The name map is indexed by token name, regardless of language.
+
+    The byte and name maps may be accessed via `__getitem__`.
+
+    Additionally, a trie map contains a `TITokenTrie` for each language, indexed by language code.
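+
+    For example, given the container attached to the TI-84+CE (a sketch; the exact
+    assertions live in the updated tests)::
+
+        TI_84PCE.tokens["setDate("]    # lookup by name, in any language
+        TI_84PCE.tokens[b'1']          # lookup by token bytes
+        TI_84PCE.tokens.langs["en"]    # name -> TIToken map for English
+        TI_84PCE.tokens.tries[None]    # default (English) tokenization trie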
+    """
+
+    def __init__(self, tokens: Tokens):
+        self.bytes = {bits: TIToken(token) for bits, token in tokens.bytes.items()}
+        self.langs = {lang: {name: self.bytes[bits] for name, bits in tokens.langs[lang].items()}
+                      for lang in tokens.langs}
+
+        # Flattened name index (probably won't have any clashes)
+        self.names = {name: token for tokens in self.langs.values() for name, token in tokens.items()}
+
+        # Tries
+        self.tries = {lang: TITokenTrie.from_tokens(self, lang) for lang in self.langs}
+
+        self.langs[None] = self.langs["en"]
+        self.tries[None] = self.tries["en"]
+
+    def __getitem__(self, item: bytes | str) -> TIToken:
+        if isinstance(item, bytes):
+            return self.bytes[item]
+
+        elif isinstance(item, str):
+            return self.names[item]
+
+        raise KeyError(item)
+
+
+__all__ = ["TITokenTrie", "TITokens"]
diff --git a/tivars/types/gdb.py b/tivars/types/gdb.py
index ece235b..8733449 100644
--- a/tivars/types/gdb.py
+++ b/tivars/types/gdb.py
@@ -13,7 +13,6 @@
 from tivars.flags import *
 from tivars.data import *
 from tivars.models import *
-from tivars.tokenizer import decode
 from tivars.var import TIEntry, SizedEntry
 from .real import *
 from .tokenized import TIEquation
diff --git a/tivars/types/tokenized.py b/tivars/types/tokenized.py
index 42bd57e..f34e517 100644
--- a/tivars/types/tokenized.py
+++ b/tivars/types/tokenized.py
@@ -6,7 +6,7 @@
 import re
 
 from io import BytesIO
-from typing import Sequence
+from typing import Iterator, Sequence
 from warnings import catch_warnings, simplefilter, warn
 
 from tivars.data import *
@@ -66,19 +66,25 @@ def __format__(self, format_spec: str) -> str:
         except (AttributeError, KeyError, TypeError, ValueError):
             return super().__format__(format_spec)
 
+    def __iter__(self) -> Iterator[TIToken]:
+        return iter(self.tokens())
+
     @staticmethod
-    def decode(data: bytes, *, lang: str = "en", mode: str = "display") -> str:
+    def decode(data: bytes, *, model: TIModel = None, lang: str = None, mode: str = None) -> str:
         """
         Decodes a byte stream into a string of tokens
 
         :param data: The token bytes to decode
-        :param lang: The language used in ``string`` (defaults to English, ``en``)
+        :param model: A model for which compatibility is ensured (defaults to the TI-84+CE)
+        :param lang: The language to use for the token representations (defaults to the locale of `model`, or English, ``en``)
         :param mode: The form of token representation to use for output (defaults to ``display``)
         :return: A string of token representations
         """
 
         try:
-            return "".join(getattr(token.langs[lang], mode) for token in decode(data)[0])
+            model = model or TI_84PCE
+            return "".join(getattr(token.langs[lang or model.lang], mode or "display")
+                           for token in decode(data, tokens=model.tokens)[0])
 
         except (AttributeError, TypeError):
             raise ValueError(f"unrecognized tokenization mode: '{mode}'")
@@ -91,14 +97,14 @@ def encode(string: str, *, model: TIModel = None, lang: str = None, mode: str =
         For detailed information on tokenization modes, see `tivars.tokenizer.encode`.
 
         :param string: The text string to encode
-        :param model: The model to target when encoding (defaults to no specific model)
-        :param lang: The language used in ``string`` (defaults to English, ``en``)
+        :param model: A model to target when encoding (defaults to no specific model)
+        :param lang: The language used in ``string`` (defaults to the locale of `model`, or English, ``en``)
         :param mode: The tokenization mode to use (defaults to ``smart``)
         :return: A stream of token bytes
         """
 
         model = model or TI_84PCE
-        return encode(string, trie=model.get_trie(lang), mode=mode)[0]
+        return encode(string, trie=model.tokens.tries[lang or model.lang], mode=mode)[0]
 
     def get_min_os(self, data: bytes = None) -> OsVersion:
         return decode(data or self.data)[1]
@@ -171,19 +177,19 @@ def load_string(self, string: str, *, model: TIModel = None, lang: str = None, m
 
         self.data = self.encode(string, model=model, lang=lang, mode=mode)
 
-    @Loader[Sequence[Token]]
-    def load_tokens(self, tokens: Sequence[Token]):
+    @Loader[Sequence[TIToken]]
+    def load_tokens(self, tokens: Sequence[TIToken]):
         """
-        Loads this entry from a sequence of `Token` objects
+        Loads this entry from a sequence of `TIToken` objects
 
         :param tokens: The sequence of tokens to load
         """
 
         self.data = b''.join(token.bits for token in tokens)
 
-    def tokens(self) -> list[Token]:
+    def tokens(self) -> list[TIToken]:
         """
-        :return: The tokens comprising this entry as a list of `Token` objects
+        :return: The tokens comprising this entry as a list of `TIToken` objects
         """
 
         return decode(self.data)[0]
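
A quick round-trip sketch of the interfaces this patch introduces (names are taken
from the diff above; the TI-84+CE defaults mirror the ones used throughout):

    from tivars.models import TI_84PCE
    from tivars.tokenizer import decode, encode

    # Encode text into token bytes using a language-specific trie
    data, min_os = encode("setDate(1", trie=TI_84PCE.tokens.tries["en"])

    # Decode the bytes back into TIToken objects and a minimum OS version
    tokens, min_os = decode(data, tokens=TI_84PCE.tokens)

    # Tries report every viable parse, longest munch first
    matches = TI_84PCE.tokens.tries[None].match("setDate(1")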