Add token accessors and rework name converters
kg583 committed Dec 19, 2024
1 parent 5ac72c2 commit 05208f9
Showing 8 changed files with 81 additions and 56 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -261,7 +261,7 @@ All entry types support string formatting using Python's f-strings.
- `width`: how many digits to group together *(default: no groups)*
- Tokenized entries support formatting of their tokens into readable lines: `{line_spec}{sep}{type}{lang}`
- `line_spec`: format specifier for line numbers *(default: no line numbers)*
- `sep`: a string to separate lines and line numbers *(default: none)*
- `sep`: a string to separate lines and line numbers *(required for line numbering)*
- `type`: how to format each token
- `a`: use accessible names
- `d`: use display names *(default)*
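As a rough illustration of the reworked spec described above (the entry name and program body are assumptions, not part of this diff):

```python
from tivars import TIProgram

prgm = TIProgram(name="HELLO")
prgm.load_string('Disp "HELLO')

print(f"{prgm}")        # display names, no line numbers
print(f"{prgm:a}")      # accessible names
print(f"{prgm:d.en}")   # display names with an explicit language suffix
print(f"{prgm:02d: }")  # two-digit line numbers followed by ": "
```

A bare `line_spec` with no trailing `sep` no longer parses, which is why `sep` is now marked as required for line numbering.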
22 changes: 18 additions & 4 deletions tivars/tokenizer/__init__.py
@@ -16,14 +16,28 @@ class TokenizedString(String):
"""
Converter for data sections best interpreted as strings of tokens
Tokenization uses the TI-84+CE token sheet, which is backwards compatible for all var name tokens.
Tokenization uses the TI-84+CE token sheet.
"""

_T = str

@classmethod
def get(cls, data: bytes, **kwargs) -> _T:
return decode(data.ljust(8, b'\x00'))[0]
return "".join(token.langs["en"].display for token in decode(data.ljust(8, b'\x00'))[0])

@classmethod
def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
return encode(value)[0].rstrip(b'\x00')


class Name(TokenizedString):
"""
Converter for names of vars
Tokenization uses the TI-84+CE token sheet, which is backwards compatible for all var name tokens.
"""

_T = str

@classmethod
def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
@@ -37,5 +51,5 @@ def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
return data


__all__ = ["decode", "encode", "normalize", "TokenizedString",
"Tokens", "OsVersion", "OsVersions"]
__all__ = ["decode", "encode", "normalize", "Name", "TokenizedString",
"Token", "Tokens", "OsVersion", "OsVersions"]
49 changes: 17 additions & 32 deletions tivars/tokenizer/decoder.py
@@ -9,11 +9,15 @@
from tivars.tokens.scripts import *


def decode(bytestream: bytes, *,
tokens: Tokens = None, lang: str = "en",
mode: str = "display") -> tuple[str | bytes, OsVersion]:
def invalid_token(bits: bytes) -> Token:
name = rf"\x{bits.hex()}" if len(bits) == 1 else rf"\u{bits.hex()}"
return Token(bits, {"en": Translation(b'?', "?", name, [])})



def decode(bytestream: bytes, *, tokens: Tokens = None) -> tuple[list[Token], OsVersion]:
"""
Decodes a byte stream into a string of tokens and its minimum supported OS version
Decodes a byte stream into a list of `Token` objects and its minimum supported OS version
Each token is represented using one of three different representation formats, dictated by ``mode``:
- ``display``: Represents the tokens with Unicode characters matching the calculator's display
@@ -22,18 +26,14 @@ def decode(bytestream: bytes, *,
:param bytestream: The token bytes to decode
:param tokens: The `Tokens` object to use for decoding (defaults to the TI-84+CE tokens)
:param lang: The language used in ``string`` (defaults to English, ``en``)
:param mode: The form of token representation to use for output (defaults to ``display``)
:return: A tuple of a string of token representations and a minimum `OsVersion`
:return: A tuple of a list of `Token` objects and a minimum `OsVersion`
"""

tokens = tokens or TI_84PCE.tokens

out = []
since = OsVersions.INITIAL

byte_attr = mode == "ti_ascii"

index = 0
curr_bytes = b''
while index < len(bytestream):
@@ -42,50 +42,35 @@

if curr_bytes[0]:
if curr_bytes in tokens.bytes:
try:
out.append(getattr(tokens.bytes[curr_bytes].langs[lang], mode))

except AttributeError:
raise ValueError(f"'{mode}' is not a recognized token representation")

except KeyError:
raise ValueError(f"'{lang}' is not a recognized language")

out.append(tokens.bytes[curr_bytes])
since = max(tokens.bytes[curr_bytes].since, since)

curr_bytes = b''

elif len(curr_bytes) >= 2:
if not any(key.startswith(curr_bytes[:1]) for key in tokens.bytes):
warn(f"Unrecognized byte '0x{curr_hex}' at position {index}.",
BytesWarning)

out.append(b'?' if byte_attr else rf"\x{curr_hex}")

else:
warn(f"Unrecognized bytes '0x{curr_hex}' at position {index}.",
BytesWarning)

out.append(b'?' if byte_attr else rf"\u{curr_hex}")
warn(f"Unrecognized byte(s) '0x{curr_hex}' at position {index}.",
BytesWarning)

out.append(invalid_token(curr_bytes))
curr_bytes = b''

elif any(curr_bytes):
elif curr_bytes[-1]:
count = 0
while not curr_bytes[0]:
curr_bytes = curr_bytes[1:]
count += 1
out.append(b'?' if byte_attr else r"\x00")
out.append(invalid_token(b'\x00'))

warn(f"There are {count} unexpected null bytes at position {index}." if count > 1 else
f"There is an unexpected null byte at position {index}.",
BytesWarning)

curr_bytes = b''
index -= 1

index += 1

return b''.join(out) if byte_attr else "".join(out), since
return out, since


__all__ = ["decode"]
2 changes: 1 addition & 1 deletion tivars/types/gdb.py
@@ -283,7 +283,7 @@ def json_name(self) -> str:
:return: The name of this equation used in the GDB JSON format
"""

return decode(self.raw.name, mode="accessible")[0].strip("{}|")
return self.decode(self.raw.name, mode="accessible").strip("{}|")

def load_data_section(self, data: BytesIO):
flag_byte = data.read(1)
2 changes: 1 addition & 1 deletion tivars/types/list.py
@@ -15,7 +15,7 @@
from .real import RealEntry


class ListName(TokenizedString):
class ListName(Name):
"""
Converter for the name section of lists
4 changes: 2 additions & 2 deletions tivars/types/picture.py
@@ -11,7 +11,7 @@

from tivars.data import *
from tivars.models import *
from tivars.tokenizer import TokenizedString
from tivars.tokenizer import Name
from tivars.var import SizedEntry

RGB = tuple[int, int, int]
@@ -331,7 +331,7 @@ def array(self) -> list[list[pixel_type]]:


# Workaround until the token sheets are updated
class ImageName(TokenizedString):
class ImageName(Name):
"""
Converter for the name section of images
52 changes: 39 additions & 13 deletions tivars/types/tokenized.py
@@ -6,6 +6,7 @@
import re

from io import BytesIO
from typing import Sequence
from warnings import catch_warnings, simplefilter, warn

from tivars.data import *
@@ -43,9 +44,8 @@ class TokenizedEntry(SizedEntry):

def __format__(self, format_spec: str) -> str:
try:
lines, sep, spec, lang = re.match(r"(.*?[a-z%#])?(\W*)(\w?)\.?(\w+)?", format_spec).groups()
line_number = f"{{index:{lines}}}{sep}" if lines else sep
lang = lang or "en"
lines, sep, spec, lang = re.match(r"(?:(.*?[a-z%#])(\W+))?(\w?)(\.\w+)?$", format_spec).groups()
lang = (lang or ".en")[1:]

match spec:
case "" | "d":
@@ -57,25 +57,31 @@
case _:
raise KeyError

return "\n".join(line_number.format(index=index) + line for index, line in enumerate(string.split("\n")))
if lines:
return "\n".join(f"{index:{lines}}{sep}" + line for index, line in enumerate(string.split("\n")))

else:
return string

except (AttributeError, KeyError, TypeError, ValueError):
return super().__format__(format_spec)

@staticmethod
def decode(data: bytes, *, lang: str = "en", mode: str = "display") -> str | bytes:
def decode(data: bytes, *, lang: str = "en", mode: str = "display") -> str:
"""
Decodes a byte stream into a string of tokens
For detailed information on tokenization modes, see `tivars.tokenizer.decode`.
:param data: The token bytes to decode
:param lang: The language used in ``string`` (defaults to English, ``en``)
:param mode: The form of token representation to use for output (defaults to ``display``)
:return: A string of token representations
"""

return decode(data, lang=lang, mode=mode)[0]
try:
return "".join(getattr(token.langs[lang], mode) for token in decode(data)[0])

except (AttributeError, TypeError):
raise ValueError(f"unrecognized tokenization mode: '{mode}'")

@staticmethod
def encode(string: str, *, model: TIModel = None, lang: str = None, mode: str = None) -> bytes:
@@ -165,6 +171,23 @@ def load_string(self, string: str, *, model: TIModel = None, lang: str = None, m

self.data = self.encode(string, model=model, lang=lang, mode=mode)

@Loader[Sequence[Token]]
def load_tokens(self, tokens: Sequence[Token]):
"""
Loads this entry from a sequence of `Token` objects
:param tokens: The sequence of tokens to load
"""

self.data = b''.join(token.bits for token in tokens)

def tokens(self) -> list[Token]:
"""
:return: The tokens comprising this entry as a list of `Token` objects
"""

return decode(self.data)[0]


class TIEquation(TokenizedEntry, register=True):
"""
Expand All @@ -191,7 +214,7 @@ def __init__(self, init=None, *,

super().__init__(init, for_flash=for_flash, name=name, version=version, archived=archived, data=data)

@Section(8, TokenizedString)
@Section(8, Name)
def name(self, value) -> str:
"""
The name of the entry
@@ -245,7 +268,7 @@ def __init__(self, init=None, *,

super().__init__(init, for_flash=for_flash, name=name, version=version, archived=archived, data=data)

@Section(8, TokenizedString)
@Section(8, Name)
def name(self, value) -> str:
"""
The name of the entry
@@ -335,13 +358,16 @@ def load_string(self, string: str, *, model: TIModel = None, lang: str = None, m
super().load_string(string, model=model, lang=lang, mode=mode)

def string(self) -> str:
string = super().string()

if not self.is_tokenized:
warn("ASM programs may not have tokenized data.",
UserWarning)

return string
with catch_warnings():
simplefilter("ignore")
return super().string()

else:
return super().string()

def coerce(self):
with catch_warnings():
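A rough usage sketch of the new accessors on `TokenizedEntry` (the program name and body are assumptions):

```python
from tivars import TIProgram

prgm = TIProgram(name="HELLO")
prgm.load_string('Disp "HELLO')

toks = prgm.tokens()           # the data section as a list of Token objects
copy = TIProgram(name="COPY")
copy.load_tokens(toks)         # rebuild another entry token-by-token

assert copy.string() == prgm.string()
```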
4 changes: 2 additions & 2 deletions tivars/var.py
@@ -13,7 +13,7 @@

from .data import *
from .models import *
from .tokenizer import TokenizedString
from .tokenizer import Name


match version_info[:2]:
@@ -515,7 +515,7 @@ def type_id(self) -> int:
The type determines how the contents of the data section of the entry are interpreted.
"""

@Section(8, TokenizedString)
@Section(8, Name)
def name(self) -> str:
"""
The name of the entry
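With `Name` as the converter here, the name section still reads and writes plain strings; a minimal sketch, assuming a program entry:

```python
from tivars import TIProgram

prgm = TIProgram()
prgm.name = "ABC"    # tokenized into the 8-byte name section by Name.set
print(prgm.name)     # "ABC", decoded back via the inherited TokenizedString.get
```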
