
Do tokens gooder
kg583 committed Jan 3, 2025
1 parent 17aa6d7 commit 4aa2fdc
Showing 12 changed files with 219 additions and 122 deletions.
6 changes: 4 additions & 2 deletions tests/tivars.py
@@ -170,9 +170,11 @@ def test_load_from_string(self):
 
         test_program.load_string(string := "setDate(1")
         self.assertEqual(test_program.string(), string)
         self.assertEqual(f"{test_program:a}", string)
-        self.assertEqual(test_program.tokens(), [TI_84PCE.tokens.bytes[b'\xef\x00'],
-                                                 TI_84PCE.tokens.bytes[b'1']])
+        self.assertEqual(f"{test_program:02d: }", f"00: {string}")
+
+        self.assertEqual(test_program.tokens(), [TI_84PCE.tokens["setDate("],
+                                                 TI_84PCE.tokens[b'1']])
 
         # Version is wrong(?)
         test_program.version = 0x04
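
Example (not part of the diff): the updated test shows the new lookup surface. A model's tokens container is now indexed directly by recognized name or by raw bytes, replacing the old tokens.bytes mapping. A minimal sketch, assuming tivars at this commit:

    from tivars.models import TI_84PCE

    # Index the TITokens container by name or by bytes
    assert TI_84PCE.tokens["setDate("].bits == b'\xef\x00'
    assert TI_84PCE.tokens[b'\xef\x00'].display == "setDate("

    # Old style, removed by this commit:
    # TI_84PCE.tokens.bytes[b'\xef\x00']
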
4 changes: 2 additions & 2 deletions tivars/__init__.py
@@ -4,10 +4,10 @@
 
 
 from .flash import *
+from .models import *
 from .tokenizer import *
 from .types import *
-from .models import *
 from .var import *
 
 
-__all__ = list({*flash.__all__, *tokenizer.__all__, *types.__all__, *models.__all__, *var.__all__})
+__all__ = list({*flash.__all__, *models.__all__, *tokenizer.__all__, *types.__all__, *var.__all__})
84 changes: 10 additions & 74 deletions tivars/models/model.py
@@ -8,8 +8,8 @@
 from functools import total_ordering
 
 from tivars.flags import *
-from tivars.tokens.scripts import OsVersion, Tokens, TokenTrie
-from tivars.tokens.scripts.parse import MODEL_ORDER
+from tivars.tokens.scripts.parse import MODEL_ORDER, OsVersion, Tokens
+from tivars.trie import *
 
 
 @total_ordering
@@ -29,20 +29,14 @@ class TIModel:
     """
 
     def __init__(self, name: str, features: 'TIFeature', magic: str, product_id: int, lang: str):
-        self._name = name
-        self._features = TIFeature(features)
-        self._magic = magic
-        self._product_id = product_id
-        self._lang = lang
+        self.name = name
+        self.features = TIFeature(features)
+        self.magic = magic
+        self.product_id = product_id
+        self.lang = lang
 
         with open(os.path.join(os.path.dirname(__file__), "../tokens/8X.xml"), encoding="UTF-8") as file:
-            self._tokens = Tokens.from_xml_string(file.read(), self.OS("latest"))
-
-        self._trie = {}
-        for lang in self._tokens.langs:
-            self._trie[lang] = TokenTrie.from_tokens(self._tokens, lang)
-
-        self._trie[None] = self._trie["en"]
+            self.tokens = TITokens(Tokens.from_xml_string(file.read(), self.OS("latest")))
 
     def __eq__(self, other):
         return str(self) == str(other)
@@ -56,71 +56,13 @@ def __hash__(self):
     def __str__(self):
         return self.name
 
-    @property
-    def features(self) -> 'TIFeature':
-        """
-        :return: This model's features
-        """
-
-        return self._features
-
-    @property
-    def lang(self) -> str:
-        """
-        :return: This model's native language
-        """
-
-        return self._lang
-
-    @property
-    def magic(self) -> str:
-        """
-        :return: This model's file magic
-        """
-
-        return self._magic
-
-    @property
-    def name(self) -> str:
-        """
-        :return: This model's (abbreviated) name
-        """
-
-        return self._name
-
     @property
     def order(self) -> int:
         """
         :return: This model's order within the chronology used by the token sheets
         """
 
-        return MODEL_ORDER[self._name]
-
-    @property
-    def product_id(self) -> int:
-        """
-        :return: This model's product ID
-        """
-
-        return self._product_id
-
-    @property
-    def tokens(self) -> Tokens:
-        """
-        :return: The tokens supported by this model
-        """
-
-        return self._tokens
-
-    def get_trie(self, lang: str = None) -> TokenTrie:
-        """
-        Gets the token trie for this model corresponding to a given language
-
-        :param lang: A language code (defaults to English, ``en``)
-        :return: The token trie corresponding to ``lang``
-        """
-
-        return self._trie[lang]
+        return MODEL_ORDER[self.name]
 
     def has(self, feature: 'TIFeature'):
         """
@@ -130,7 +66,7 @@ def has(self, feature: 'TIFeature'):
         :return: Whether this model has ``feature``
         """
 
-        return feature in self._features
+        return feature in self.features
 
     def OS(self, version: str = "") -> OsVersion:
         """
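
Example (not part of the diff): with the property boilerplate gone, a model's metadata reads as plain attributes, and the per-language tries now hang off the TITokens container rather than get_trie. A sketch of the call sites this change implies; the "en" key is an assumption based on the old trie table:

    from tivars.models import TI_84PCE

    TI_84PCE.name                        # plain attribute, no property wrapper
    TI_84PCE.order                       # still derived from MODEL_ORDER
    trie = TI_84PCE.tokens.tries["en"]   # was TI_84PCE.get_trie("en")
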
4 changes: 2 additions & 2 deletions tivars/models/versions.py
@@ -3,11 +3,11 @@
 """
 
 
-import tivars.tokens.scripts.parse as parse
+import tivars.tokens.scripts as tokens
 from .model import *
 
 
-class OsVersions(parse.OsVersions):
+class OsVersions(tokens.OsVersions):
     """
     Namespace containing useful OS versions
44 changes: 44 additions & 0 deletions tivars/token.py
@@ -0,0 +1,44 @@
+from tivars.tokens.scripts import *
+
+
+class TIToken(Token):
+    """
+    Interface extension for the token sheets base ``Token`` container
+
+    TITokens can be fetched by bytes or recognized names from a `TITokens` container attached to a `TIModel`.
+    Instantiating your own `TIToken` is not recommended.
+    """
+
+    def __init__(self, token: Token):
+        super().__init__(token.bits, token.langs, token.attrs, token.since, token.until)
+
+        self.translation = self.langs[None] = self.langs["en"]
+
+    def __repr__(self) -> str:
+        return f"<{self.display} ({self.escape})>"
+
+    @property
+    def accessible(self) -> str:
+        return self.translation.accessible
+
+    @property
+    def display(self) -> str:
+        return self.translation.display
+
+    @property
+    def escape(self) -> str:
+        return rf"\{'x' if len(self.bits) == 1 else 'u'}{self.bits.hex()}"
+
+    def names(self) -> list[str]:
+        return self.translation.names()
+
+
+class IllegalToken(TIToken):
+    def __init__(self, bits: bytes):
+        self.bits = bits
+
+        super().__init__(Token(bits, {"en": Translation(b'?', "?", self.escape, [])},
+                               {"illegal": "true"}))
+
+
+__all__ = ["TIToken", "IllegalToken"]
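
Example (not part of the diff): TIToken forwards the English translation's views, and IllegalToken is the catch-all wrapper for bytes without a token sheet entry. A minimal sketch; the byte values come from the test above, and the reprs follow the __repr__ defined here:

    from tivars.models import TI_84PCE
    from tivars.token import IllegalToken

    tok = TI_84PCE.tokens["setDate("]
    tok.escape               # r"\uef00": two bytes, so a \u escape
    repr(tok)                # "<setDate( (\uef00)>"

    bad = IllegalToken(b'\x00')
    repr(bad)                # "<? (\x00)>": display falls back to "?"
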
5 changes: 4 additions & 1 deletion tivars/tokenizer/__init__.py
@@ -7,7 +7,9 @@
 
 from tivars.data import String
 from tivars.models import *
+from tivars.token import *
 from tivars.tokens.scripts import *
+from tivars.trie import *
 from .decoder import *
 from .encoder import *
 
@@ -41,6 +43,7 @@ class Name(TokenizedString):
 
     @classmethod
     def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
+        # Is this necessary?
         mode = "max" if instance is not None and instance.leading_name_byte else "string"
         data = encode(value, mode=mode)[0].rstrip(b'\x00')
 
@@ -52,4 +55,4 @@ def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
 
 
 __all__ = ["decode", "encode", "normalize", "Name", "TokenizedString",
-           "Token", "Tokens", "OsVersion", "OsVersions"]
+           "TIToken", "IllegalToken", "TITokenTrie", "TITokens", "OsVersion", "OsVersions"]
21 changes: 8 additions & 13 deletions tivars/tokenizer/decoder.py
@@ -6,27 +6,22 @@
 
 from warnings import warn
 
 from tivars.models import *
-from tivars.tokens.scripts import *
+from tivars.token import *
+from tivars.trie import *
 
 
-def invalid_token(bits: bytes) -> Token:
-    name = rf"\x{bits.hex()}" if len(bits) == 1 else rf"\u{bits.hex()}"
-    return Token(bits, {"en": Translation(b'?', "?", name, [])})
-
-
-def decode(bytestream: bytes, *, tokens: Tokens = None) -> tuple[list[Token], OsVersion]:
+def decode(bytestream: bytes, *, tokens: TITokens = None) -> tuple[list[TIToken], OsVersion]:
     """
-    Decodes a byte stream into a list of `Token` objects and its minimum supported OS version
+    Decodes a byte stream into a list of `TIToken` objects and its minimum supported OS version
 
     Each token is represented using one of three different representations formats, dictated by ``mode``:
 
     - ``display``: Represents the tokens with Unicode characters matching the calculator's display
     - ``accessible``: Represents the tokens with ASCII-only equivalents, often requiring multi-character glyphs
    - ``ti_ascii``: Represents the tokens with their internal font indices (returns a ``bytes`` object)
 
     :param bytestream: The token bytes to decode
-    :param tokens: The `Tokens` object to use for decoding (defaults to the TI-84+CE tokens)
-    :return: A tuple of a list of `Token` objects and a minimum `OsVersion`
+    :param tokens: The `TITokens` object to use for decoding (defaults to the TI-84+CE tokens)
+    :return: A tuple of a list of `TIToken` objects and a minimum `OsVersion`
     """
 
     tokens = tokens or TI_84PCE.tokens
@@ -51,15 +46,15 @@ def decode(bytestream: bytes, *, tokens: Tokens = None) -> tuple[list[Token], OsVersion]:
                 warn(f"Unrecognized byte(s) '0x{curr_hex}' at position {index}.",
                      BytesWarning)
 
-                out.append(invalid_token(curr_bytes))
+                out.append(IllegalToken(curr_bytes))
                 curr_bytes = b''
 
             elif curr_bytes[-1]:
                 count = 0
                 while not curr_bytes[0]:
                     curr_bytes = curr_bytes[1:]
                     count += 1
-                    out.append(invalid_token(b'\x00'))
+                    out.append(IllegalToken(b'\x00'))
 
                 warn(f"There are {count} unexpected null bytes at position {index}." if count > 1 else
                      f"There is an unexpected null byte at position {index}.",
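
Example (not part of the diff): decoding is unchanged in shape; it now yields TIToken objects, with IllegalToken (plus a BytesWarning) standing in for unrecognized bytes. A sketch using the byte values from the test above:

    from tivars.tokenizer import decode

    toks, min_os = decode(b'\xef\x001')      # defaults to the TI-84+CE token sheet
    assert [tok.display for tok in toks] == ["setDate(", "1"]
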
6 changes: 3 additions & 3 deletions tivars/tokenizer/encoder.py
@@ -7,12 +7,12 @@
 import unicodedata
 
 from tivars.models import *
-from tivars.tokens.scripts import *
+from tivars.trie import *
 from .state import *
 
 
 def encode(string: str, *,
-           trie: TokenTrie = None, mode: str = None, normalize: bool = True) -> tuple[bytes, OsVersion]:
+           trie: TITokenTrie = None, mode: str = None, normalize: bool = True) -> tuple[bytes, OsVersion]:
     """
     Encodes a string of tokens represented as text into a byte stream and its minimum supported OS version
 
@@ -43,7 +43,7 @@
     """
 
     string = _normalize(string) if normalize else string
-    trie = trie or TI_84PCE.get_trie()
+    trie = trie or TI_84PCE.tokens.tries[None]
     mode = mode or "smart"
 
     data = b''
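
Example (not part of the diff): the default trie moves from TI_84PCE.get_trie() to the tries dict on the model's TITokens container, where the None key aliases English. Round-tripping the test's input under that default, as a sketch:

    from tivars.models import TI_84PCE
    from tivars.tokenizer import decode, encode

    data, min_os = encode("setDate(1")       # trie defaults to TI_84PCE.tokens.tries[None]
    assert data == b'\xef\x001'
    assert decode(data)[0] == [TI_84PCE.tokens["setDate("], TI_84PCE.tokens[b'1']]
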
23 changes: 11 additions & 12 deletions tivars/tokenizer/state.py
@@ -5,7 +5,8 @@
 
 from string import punctuation
 
-from tivars.tokens.scripts import *
+from tivars.token import *
+from tivars.trie import *
 
 
 class EncoderState:
@@ -28,7 +29,7 @@ class EncoderState:
     def __init__(self, length: int = 0):
         self.length = length
 
-    def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['EncoderState']]:
+    def munch(self, string: str, trie: TITokenTrie) -> tuple[TIToken, str, list['EncoderState']]:
         """
         Munch the input string and determine the resulting token, encoder state, and remainder of the string
@@ -41,13 +42,11 @@ def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['EncoderState']]:
         if string.startswith(r"\x") or string.startswith(r"\u"):
             length = 4 if string.startswith(r"\x") else 6
             string, remainder = string[:length], string[length:]
-            token = Token(bytes.fromhex(string.lstrip(r"\ux")),
-                          {"en": Translation(b'?', string, string, [])},
-                          {"illegal": "true"})
+            token = IllegalToken(bytes.fromhex(string.lstrip(r"\ux")))
 
             return token, remainder, self.next(token)
 
-        tokens = trie.get_tokens(string)
+        tokens = trie.match(string)
         if not tokens:
             raise ValueError("no tokenization options exist")
@@ -63,7 +62,7 @@ def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['EncoderState']]:
 
         return token, remainder, self.next(token)
 
-    def next(self, token: Token) -> list['EncoderState']:
+    def next(self, token: TIToken) -> list['EncoderState']:
         """
         Determines the next encode state given a token
@@ -102,7 +101,7 @@ class Line(EncoderState):
     Encoder state which is always exited after a line break
     """
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x04' | b'\x3F':
                 return []
@@ -118,7 +117,7 @@ class Name(Line):
 
     mode = -1
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         # Digits          Uppercase letters (and theta)
         if b'\x30' <= token.bits <= b'\x39' or b'\x41' <= token.bits <= b'\x5B':
             return super().next(token)
@@ -150,7 +149,7 @@ class String(Line):
 
     mode = -1
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return []
@@ -176,7 +175,7 @@ class InterpolationStart(Line):
 
     mode = 0
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return [InterpolatedString()]
@@ -192,7 +191,7 @@ class SmartMode(EncoderState):
 
     mode = 0
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return [self, String()]
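
Example (not part of the diff): munch still recognizes the \xNN and \uNNNN escapes that TIToken.escape produces, but now funnels them through IllegalToken, and trie.match replaces trie.get_tokens for ordinary lookups. A sketch of the escape path, assuming encode drives these states as before:

    from tivars.tokenizer import encode

    # An escape encodes to exactly the bytes it names, even with no sheet entry
    data, _ = encode(r"\uef00" + "1")
    assert data == b'\xef\x001'
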