From 4aa2fdc135d3b0e80460dfe1bbd77544c0a0061f Mon Sep 17 00:00:00 2001
From: KG
Date: Fri, 3 Jan 2025 02:26:19 -0500
Subject: [PATCH] Do tokens gooder

---
 tests/tivars.py              |   6 +-
 tivars/__init__.py           |   4 +-
 tivars/models/model.py       |  84 ++++----------------
 tivars/models/versions.py    |   4 +-
 tivars/token.py              |  44 ++++++++++++++
 tivars/tokenizer/__init__.py |   5 +-
 tivars/tokenizer/decoder.py  |  21 +++----
 tivars/tokenizer/encoder.py  |   6 +-
 tivars/tokenizer/state.py    |  23 ++++---
 tivars/trie.py               | 113 +++++++++++++++++++++++++++++++++++
 tivars/types/gdb.py          |   1 -
 tivars/types/tokenized.py    |  30 ++++++----
 12 files changed, 219 insertions(+), 122 deletions(-)
 create mode 100644 tivars/token.py
 create mode 100644 tivars/trie.py

diff --git a/tests/tivars.py b/tests/tivars.py
index c0a95c1..e5bd31b 100644
--- a/tests/tivars.py
+++ b/tests/tivars.py
@@ -170,9 +170,11 @@ def test_load_from_string(self):
         test_program.load_string(string := "setDate(1")
 
         self.assertEqual(test_program.string(), string)
+        self.assertEqual(f"{test_program:a}", string)
         self.assertEqual(f"{test_program:02d: }", f"00: {string}")
-        self.assertEqual(test_program.tokens(), [TI_84PCE.tokens.bytes[b'\xef\x00'],
-                                                 TI_84PCE.tokens.bytes[b'1']])
+
+        self.assertEqual(test_program.tokens(), [TI_84PCE.tokens["setDate("],
+                                                 TI_84PCE.tokens[b'1']])
 
         # Version is wrong(?)
         test_program.version = 0x04
diff --git a/tivars/__init__.py b/tivars/__init__.py
index deca750..a60fa16 100644
--- a/tivars/__init__.py
+++ b/tivars/__init__.py
@@ -4,10 +4,10 @@
 
 
 from .flash import *
+from .models import *
 from .tokenizer import *
 from .types import *
-from .models import *
 from .var import *
 
 
-__all__ = list({*flash.__all__, *tokenizer.__all__, *types.__all__, *models.__all__, *var.__all__})
+__all__ = list({*flash.__all__, *models.__all__, *tokenizer.__all__, *types.__all__, *var.__all__})
diff --git a/tivars/models/model.py b/tivars/models/model.py
index 1c0b60c..9298425 100644
--- a/tivars/models/model.py
+++ b/tivars/models/model.py
@@ -8,8 +8,8 @@
 from functools import total_ordering
 
 from tivars.flags import *
-from tivars.tokens.scripts import OsVersion, Tokens, TokenTrie
-from tivars.tokens.scripts.parse import MODEL_ORDER
+from tivars.tokens.scripts.parse import MODEL_ORDER, OsVersion, Tokens
+from tivars.trie import *
 
 
 @total_ordering
@@ -29,20 +29,14 @@ class TIModel:
     """
 
     def __init__(self, name: str, features: 'TIFeature', magic: str, product_id: int, lang: str):
-        self._name = name
-        self._features = TIFeature(features)
-        self._magic = magic
-        self._product_id = product_id
-        self._lang = lang
+        self.name = name
+        self.features = TIFeature(features)
+        self.magic = magic
+        self.product_id = product_id
+        self.lang = lang
 
         with open(os.path.join(os.path.dirname(__file__), "../tokens/8X.xml"), encoding="UTF-8") as file:
-            self._tokens = Tokens.from_xml_string(file.read(), self.OS("latest"))
-
-        self._trie = {}
-        for lang in self._tokens.langs:
-            self._trie[lang] = TokenTrie.from_tokens(self._tokens, lang)
-
-        self._trie[None] = self._trie["en"]
+            self.tokens = TITokens(Tokens.from_xml_string(file.read(), self.OS("latest")))
 
     def __eq__(self, other):
         return str(self) == str(other)
@@ -56,71 +50,13 @@ def __hash__(self):
     def __str__(self):
         return self.name
 
-    @property
-    def features(self) -> 'TIFeature':
-        """
-        :return: This model's features
-        """
-
-        return self._features
-
-    @property
-    def lang(self) -> str:
-        """
-        :return: This model's native language
-        """
-
-        return self._lang
-
-    @property
-    def magic(self) -> str:
-        """
-        :return: This model's file magic
-        """
-
-        return self._magic
-
-    @property
-    def name(self) -> str:
-        """
-        :return: This model's (abbreviated) name
-        """
-
-        return self._name
-
     @property
     def order(self) -> int:
         """
         :return: This model's order within the chronology used by the token sheets
         """
 
-        return MODEL_ORDER[self._name]
-
-    @property
-    def product_id(self) -> int:
-        """
-        :return: This model's product ID
-        """
-
-        return self._product_id
-
-    @property
-    def tokens(self) -> Tokens:
-        """
-        :return: The tokens supported by this model
-        """
-
-        return self._tokens
-
-    def get_trie(self, lang: str = None) -> TokenTrie:
-        """
-        Gets the token trie for this model corresponding to a given language
-
-        :param lang: A language code (defaults to English, ``en``)
-        :return: The token trie corresponding to ``lang``
-        """
-
-        return self._trie[lang]
+        return MODEL_ORDER[self.name]
 
     def has(self, feature: 'TIFeature'):
         """
@@ -130,7 +66,7 @@ def has(self, feature: 'TIFeature'):
         :return: Whether this model has ``feature``
         """
 
-        return feature in self._features
+        return feature in self.features
 
     def OS(self, version: str = "") -> OsVersion:
         """
diff --git a/tivars/models/versions.py b/tivars/models/versions.py
index e7d1fba..85f802a 100644
--- a/tivars/models/versions.py
+++ b/tivars/models/versions.py
@@ -3,11 +3,11 @@
 """
 
 
-import tivars.tokens.scripts.parse as parse
+import tivars.tokens.scripts as tokens
 
 from .model import *
 
 
-class OsVersions(parse.OsVersions):
+class OsVersions(tokens.OsVersions):
     """
     Namespace containing useful OS versions
diff --git a/tivars/token.py b/tivars/token.py
new file mode 100644
index 0000000..0228563
--- /dev/null
+++ b/tivars/token.py
@@ -0,0 +1,44 @@
+from tivars.tokens.scripts import *
+
+
+class TIToken(Token):
+    """
+    Interface extension for the token sheets base ``Token`` container
+
+    TITokens can be fetched by bytes or recognized names from a `TITokens` container attached to a `TIModel`.
+    Instantiating your own `TIToken` is not recommended.
+    """
+
+    def __init__(self, token: Token):
+        super().__init__(token.bits, token.langs, token.attrs, token.since, token.until)
+
+        self.translation = self.langs[None] = self.langs["en"]
+
+    def __repr__(self) -> str:
+        return f"<{self.display} ({self.escape})>"
+
+    @property
+    def accessible(self) -> str:
+        return self.translation.accessible
+
+    @property
+    def display(self) -> str:
+        return self.translation.display
+
+    @property
+    def escape(self) -> str:
+        return rf"\{'x' if len(self.bits) == 1 else 'u'}{self.bits.hex()}"
+
+    def names(self) -> list[str]:
+        return self.translation.names()
+
+
+class IllegalToken(TIToken):
+    def __init__(self, bits: bytes):
+        self.bits = bits
+
+        super().__init__(Token(bits, {"en": Translation(b'?', "?", self.escape, [])},
+                               {"illegal": "true"}))
+
+
+__all__ = ["TIToken", "IllegalToken"]
diff --git a/tivars/tokenizer/__init__.py b/tivars/tokenizer/__init__.py
index 196434d..96ffe4d 100644
--- a/tivars/tokenizer/__init__.py
+++ b/tivars/tokenizer/__init__.py
@@ -7,7 +7,9 @@
 
 from tivars.data import String
 from tivars.models import *
+from tivars.token import *
 from tivars.tokens.scripts import *
+from tivars.trie import *
 
 from .decoder import *
 from .encoder import *
@@ -41,6 +43,7 @@ class Name(TokenizedString):
 
     @classmethod
    def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
+        # Is this necessary?
         mode = "max" if instance is not None and instance.leading_name_byte else "string"
         data = encode(value, mode=mode)[0].rstrip(b'\x00')
 
@@ -52,4 +55,4 @@
 
 
 __all__ = ["decode", "encode", "normalize", "Name", "TokenizedString",
-           "Token", "Tokens", "OsVersion", "OsVersions"]
+           "TIToken", "IllegalToken", "TITokenTrie", "TITokens", "OsVersion", "OsVersions"]
diff --git a/tivars/tokenizer/decoder.py b/tivars/tokenizer/decoder.py
index f1dba3d..dde8034 100644
--- a/tivars/tokenizer/decoder.py
+++ b/tivars/tokenizer/decoder.py
@@ -6,18 +6,13 @@
 from warnings import warn
 
 from tivars.models import *
-from tivars.tokens.scripts import *
+from tivars.token import *
+from tivars.trie import *
 
 
-def invalid_token(bits: bytes) -> Token:
-    name = rf"\x{bits.hex()}" if len(bits) == 1 else rf"\u{bits.hex()}"
-    return Token(bits, {"en": Translation(b'?', "?", name, [])})
-
-
-
-def decode(bytestream: bytes, *, tokens: Tokens = None) -> tuple[list[Token], OsVersion]:
+def decode(bytestream: bytes, *, tokens: TITokens = None) -> tuple[list[TIToken], OsVersion]:
     """
-    Decodes a byte stream into a list of `Token` objects and its minimum supported OS version
+    Decodes a byte stream into a list of `TIToken` objects and its minimum supported OS version
 
     Each token is represented using one of three different representation formats, dictated by ``mode``:
 
     - ``display``: Represents the tokens with Unicode characters matching the calculator's display
     - ``accessible``: Represents the tokens with ASCII-only equivalents, often requiring multi-character glyphs
     - ``ti_ascii``: Represents the tokens with their internal font indices (returns a ``bytes`` object)
 
     :param bytestream: The token bytes to decode
-    :param tokens: The `Tokens` object to use for decoding (defaults to the TI-84+CE tokens)
-    :return: A tuple of a list of `Token` objects and a minimum `OsVersion`
+    :param tokens: The `TITokens` object to use for decoding (defaults to the TI-84+CE tokens)
+    :return: A tuple of a list of `TIToken` objects and a minimum `OsVersion`
     """
 
     tokens = tokens or TI_84PCE.tokens
@@ -25,8 +20,8 @@ def decode(bytestream: bytes, *, tokens: Tokens = None) -> tuple[list[Token], Os
             warn(f"Unrecognized byte(s) '0x{curr_hex}' at position {index}.",
                  BytesWarning)
 
-            out.append(invalid_token(curr_bytes))
+            out.append(IllegalToken(curr_bytes))
             curr_bytes = b''
 
         elif curr_bytes[-1]:
             count = 0
             while not curr_bytes[0]:
                 curr_bytes = curr_bytes[1:]
                 count += 1
-                out.append(invalid_token(b'\x00'))
+                out.append(IllegalToken(b'\x00'))
 
             warn(f"There are {count} unexpected null bytes at position {index}."
                 if count > 1 else f"There is an unexpected null byte at position {index}.",
diff --git a/tivars/tokenizer/encoder.py b/tivars/tokenizer/encoder.py
index 6124553..22393f8 100644
--- a/tivars/tokenizer/encoder.py
+++ b/tivars/tokenizer/encoder.py
@@ -7,12 +7,12 @@
 import unicodedata
 
 from tivars.models import *
-from tivars.tokens.scripts import *
+from tivars.trie import *
 
 from .state import *
 
 
 def encode(string: str, *,
-           trie: TokenTrie = None, mode: str = None, normalize: bool = True) -> tuple[bytes, OsVersion]:
+           trie: TITokenTrie = None, mode: str = None, normalize: bool = True) -> tuple[bytes, OsVersion]:
     """
     Encodes a string of tokens represented as text into a byte stream and its minimum supported OS version
 
@@ -43,7 +43,7 @@ def encode(string: str, *,
     """
 
     string = _normalize(string) if normalize else string
-    trie = trie or TI_84PCE.get_trie()
+    trie = trie or TI_84PCE.tokens.tries[None]
    mode = mode or "smart"
 
     data = b''
diff --git a/tivars/tokenizer/state.py b/tivars/tokenizer/state.py
index 9b0f3f5..a14c387 100644
--- a/tivars/tokenizer/state.py
+++ b/tivars/tokenizer/state.py
@@ -5,7 +5,8 @@
 
 from string import punctuation
 
-from tivars.tokens.scripts import *
+from tivars.token import *
+from tivars.trie import *
 
 
 class EncoderState:
@@ -28,7 +29,7 @@ class EncoderState:
     def __init__(self, length: int = 0):
         self.length = length
 
-    def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['EncoderState']]:
+    def munch(self, string: str, trie: TITokenTrie) -> tuple[TIToken, str, list['EncoderState']]:
         """
         Munch the input string and determine the resulting token, encoder state, and remainder of the string
 
@@ -41,13 +42,11 @@ def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['Encoder
         if string.startswith(r"\x") or string.startswith(r"\u"):
             length = 4 if string.startswith(r"\x") else 6
             string, remainder = string[:length], string[length:]
-            token = Token(bytes.fromhex(string.lstrip(r"\ux")),
-                          {"en": Translation(b'?', string, string, [])},
-                          {"illegal": "true"})
+            token = IllegalToken(bytes.fromhex(string.lstrip(r"\ux")))
 
             return token, remainder, self.next(token)
 
-        tokens = trie.get_tokens(string)
+        tokens = trie.match(string)
         if not tokens:
             raise ValueError("no tokenization options exist")
@@ -63,7 +62,7 @@ def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['Encoder
 
         return token, remainder, self.next(token)
 
-    def next(self, token: Token) -> list['EncoderState']:
+    def next(self, token: TIToken) -> list['EncoderState']:
         """
         Determines the next encode state given a token
 
@@ -102,7 +101,7 @@ class Line(EncoderState):
     Encoder state which is always exited after a line break
     """
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x04' | b'\x3F':
                 return []
@@ -118,7 +117,7 @@ class Name(Line):
 
     mode = -1
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         # Digits, uppercase letters, and theta
         if b'\x30' <= token.bits <= b'\x39' or b'\x41' <= token.bits <= b'\x5B':
             return super().next(token)
@@ -150,7 +149,7 @@ class String(Line):
 
     mode = -1
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return []
@@ -176,7 +175,7 @@ class InterpolationStart(Line):
 
     mode = 0
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return [InterpolatedString()]
@@ -192,7 +191,7 @@ class SmartMode(EncoderState):
 
     mode = 0
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return [self, String()]
diff --git a/tivars/trie.py b/tivars/trie.py
new file mode 100644
index 0000000..cd37b78
--- /dev/null
+++ b/tivars/trie.py
@@ -0,0 +1,113 @@
+from tivars.tokens.scripts import *
+from .token import TIToken
+
+
+class TITokenTrie:
+    """
+    Trie for tokenizing text based on ``tivars.tokens.scripts.TokenTrie``
+    """
+
+    def __init__(self):
+        self.token = None
+        self.children = {}
+
+    def insert(self, token: TIToken, lang: str = None):
+        """
+        Inserts the names of a `TIToken` into the trie in a given language
+
+        :param token: The token to insert
+        :param lang: The language to insert names from (defaults to English, ``en``)
+        """
+
+        if lang and lang not in token.langs:
+            raise ValueError(f"lang '{lang}' not found or not yet supported")
+
+        for name in token.langs[lang].names() if lang else token.names():
+            current = self
+            for char in name:
+                if char not in current.children:
+                    current.children[char] = self.__class__()
+
+                current = current.children[char]
+
+            current.token = token
+
+    @classmethod
+    def from_tokens(cls, tokens: 'TITokens', lang: str = None):
+        """
+        Builds a trie from all tokens in a `TITokens` container
+
+        :param tokens: The tokens to insert
+        :param lang: The language to insert names from (defaults to English, ``en``)
+        :return: A `TITokenTrie` containing every name of every token in ``tokens``
+        """
+
+        if lang and lang not in tokens.langs:
+            raise ValueError(f"lang '{lang}' not found or not yet supported")
+
+        root = cls()
+        for token in tokens.bytes.values():
+            root.insert(token, lang)
+
+        return root
+
+    def match(self, string: str) -> list[tuple[TIToken, str]]:
+        """
+        Finds all tokens which can be parsed from a given input string
+
+        Each token is returned with the portion of the input string still remaining.
+        Output is sorted by decreasing length of the consumed input.
+
+        :param string: The string to match against
+        :return: A list of tuples each containing a `TIToken` and its remaining input
+        """
+
+        tokens = []
+
+        if string and string[0] in self.children:
+            tokens += self.children[string[0]].match(string[1:])
+
+        if self.token:
+            tokens.append((self.token, string))
+
+        return tokens
+
+
+class TITokens:
+    """
+    Data class for storing collections of `TIToken` instances based on ``tivars.tokens.scripts.Tokens``
+
+    `TIToken` instances may be obtained from various maps:
+
+    - The byte map is indexed by token bytes.
+    - The lang map is indexed by language code, then token name.
+    - The name map is indexed by token name, regardless of language.
+
+    The byte and name maps may be accessed via `__getitem__`.
+
+    Additionally, a trie map contains a `TITokenTrie` for each language, indexed by language code.
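+
+    For example, given the container attached to the TI-84+CE (a sketch; the exact
+    assertions live in the updated tests)::
+
+        TI_84PCE.tokens["setDate("]    # lookup by name, in any language
+        TI_84PCE.tokens[b'1']          # lookup by token bytes
+        TI_84PCE.tokens.langs["en"]    # name -> TIToken map for English
+        TI_84PCE.tokens.tries[None]    # default (English) tokenization trie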
+    """
+
+    def __init__(self, tokens: Tokens):
+        self.bytes = {bits: TIToken(token) for bits, token in tokens.bytes.items()}
+        self.langs = {lang: {name: self.bytes[bits] for name, bits in tokens.langs[lang].items()}
+                      for lang in tokens.langs}
+
+        # Flattened name index (probably won't have any clashes)
+        self.names = {name: token for tokens in self.langs.values() for name, token in tokens.items()}
+
+        # Tries
+        self.tries = {lang: TITokenTrie.from_tokens(self, lang) for lang in self.langs}
+
+        self.langs[None] = self.langs["en"]
+        self.tries[None] = self.tries["en"]
+
+    def __getitem__(self, item: bytes | str) -> TIToken:
+        if isinstance(item, bytes):
+            return self.bytes[item]
+
+        elif isinstance(item, str):
+            return self.names[item]
+
+        raise KeyError(item)
+
+
+__all__ = ["TITokenTrie", "TITokens"]
diff --git a/tivars/types/gdb.py b/tivars/types/gdb.py
index ece235b..8733449 100644
--- a/tivars/types/gdb.py
+++ b/tivars/types/gdb.py
@@ -13,7 +13,6 @@
 from tivars.flags import *
 from tivars.data import *
 from tivars.models import *
-from tivars.tokenizer import decode
 from tivars.var import TIEntry, SizedEntry
 from .real import *
 from .tokenized import TIEquation
diff --git a/tivars/types/tokenized.py b/tivars/types/tokenized.py
index 42bd57e..f34e517 100644
--- a/tivars/types/tokenized.py
+++ b/tivars/types/tokenized.py
@@ -6,7 +6,7 @@
 import re
 
 from io import BytesIO
-from typing import Sequence
+from typing import Iterator, Sequence
 from warnings import catch_warnings, simplefilter, warn
 
 from tivars.data import *
@@ -66,19 +66,25 @@ def __format__(self, format_spec: str) -> str:
         except (AttributeError, KeyError, TypeError, ValueError):
             return super().__format__(format_spec)
 
+    def __iter__(self) -> Iterator[TIToken]:
+        return iter(self.tokens())
+
     @staticmethod
-    def decode(data: bytes, *, lang: str = "en", mode: str = "display") -> str:
+    def decode(data: bytes, *, model: TIModel = None, lang: str = None, mode: str = None) -> str:
         """
         Decodes a byte stream into a string of tokens
 
         :param data: The token bytes to decode
-        :param lang: The language used in ``string`` (defaults to English, ``en``)
+        :param model: A model for which compatibility is ensured (defaults to the TI-84+CE)
+        :param lang: The language to use for the token representations (defaults to the locale of `model`, or English, ``en``)
         :param mode: The form of token representation to use for output (defaults to ``display``)
         :return: A string of token representations
         """
 
         try:
-            return "".join(getattr(token.langs[lang], mode) for token in decode(data)[0])
+            model = model or TI_84PCE
+            return "".join(getattr(token.langs[lang or model.lang], mode or "display")
+                           for token in decode(data, tokens=model.tokens)[0])
 
         except (AttributeError, TypeError):
             raise ValueError(f"unrecognized tokenization mode: '{mode}'")
@@ -91,14 +97,14 @@ def encode(string: str, *, model: TIModel = None, lang: str = None, mode: str =
         For detailed information on tokenization modes, see `tivars.tokenizer.encode`.
 
         :param string: The text string to encode
-        :param model: The model to target when encoding (defaults to no specific model)
-        :param lang: The language used in ``string`` (defaults to English, ``en``)
+        :param model: A model to target when encoding (defaults to no specific model)
+        :param lang: The language used in ``string`` (defaults to the locale of `model`, or English, ``en``)
         :param mode: The tokenization mode to use (defaults to ``smart``)
         :return: A stream of token bytes
         """
 
         model = model or TI_84PCE
-        return encode(string, trie=model.get_trie(lang), mode=mode)[0]
+        return encode(string, trie=model.tokens.tries[lang or model.lang], mode=mode)[0]
 
     def get_min_os(self, data: bytes = None) -> OsVersion:
         return decode(data or self.data)[1]
@@ -171,19 +177,19 @@ def load_string(self, string: str, *, model: TIModel = None, lang: str = None, m
 
         self.data = self.encode(string, model=model, lang=lang, mode=mode)
 
-    @Loader[Sequence[Token]]
-    def load_tokens(self, tokens: Sequence[Token]):
+    @Loader[Sequence[TIToken]]
+    def load_tokens(self, tokens: Sequence[TIToken]):
         """
-        Loads this entry from a sequence of `Token` objects
+        Loads this entry from a sequence of `TIToken` objects
 
         :param tokens: The sequence of tokens to load
         """
 
         self.data = b''.join(token.bits for token in tokens)
 
-    def tokens(self) -> list[Token]:
+    def tokens(self) -> list[TIToken]:
         """
-        :return: The tokens comprising this entry as a list of `Token` objects
+        :return: The tokens comprising this entry as a list of `TIToken` objects
         """
 
         return decode(self.data)[0]
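
A quick round-trip sketch of the interfaces this patch introduces (names are taken
from the diff above; the TI-84+CE defaults mirror the ones used throughout):

    from tivars.models import TI_84PCE
    from tivars.tokenizer import decode, encode

    # Encode text into token bytes using a language-specific trie
    data, min_os = encode("setDate(1", trie=TI_84PCE.tokens.tries["en"])

    # Decode the bytes back into TIToken objects and a minimum OS version
    tokens, min_os = decode(data, tokens=TI_84PCE.tokens)

    # Tries report every viable parse, longest munch first
    matches = TI_84PCE.tokens.tries[None].match("setDate(1")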