
Do tokens gooder
kg583 committed Jan 3, 2025
1 parent 17aa6d7 commit 4aa2fdc
Showing 12 changed files with 219 additions and 122 deletions.
6 changes: 4 additions & 2 deletions tests/tivars.py
@@ -170,9 +170,11 @@ def test_load_from_string(self):
 
         test_program.load_string(string := "setDate(1")
         self.assertEqual(test_program.string(), string)
         self.assertEqual(f"{test_program:a}", string)
-        self.assertEqual(test_program.tokens(), [TI_84PCE.tokens.bytes[b'\xef\x00'],
-                                                 TI_84PCE.tokens.bytes[b'1']])
+        self.assertEqual(f"{test_program:02d: }", f"00: {string}")
+
+        self.assertEqual(test_program.tokens(), [TI_84PCE.tokens["setDate("],
+                                                 TI_84PCE.tokens[b'1']])
 
         # Version is wrong(?)
         test_program.version = 0x04
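
Example (not part of the diff): the updated test shows the new lookup surface. A model's tokens container is now indexed directly by recognized name or by raw bytes, replacing the old tokens.bytes mapping. A minimal sketch, assuming tivars at this commit:

    from tivars.models import TI_84PCE

    # Index the TITokens container by name or by bytes
    assert TI_84PCE.tokens["setDate("].bits == b'\xef\x00'
    assert TI_84PCE.tokens[b'\xef\x00'].display == "setDate("

    # Old style, removed by this commit:
    # TI_84PCE.tokens.bytes[b'\xef\x00']
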
4 changes: 2 additions & 2 deletions tivars/__init__.py
@@ -4,10 +4,10 @@
 
 
 from .flash import *
+from .models import *
 from .tokenizer import *
 from .types import *
-from .models import *
 from .var import *
 
 
-__all__ = list({*flash.__all__, *tokenizer.__all__, *types.__all__, *models.__all__, *var.__all__})
+__all__ = list({*flash.__all__, *models.__all__, *tokenizer.__all__, *types.__all__, *var.__all__})
84 changes: 10 additions & 74 deletions tivars/models/model.py
@@ -8,8 +8,8 @@
 from functools import total_ordering
 
 from tivars.flags import *
-from tivars.tokens.scripts import OsVersion, Tokens, TokenTrie
-from tivars.tokens.scripts.parse import MODEL_ORDER
+from tivars.tokens.scripts.parse import MODEL_ORDER, OsVersion, Tokens
+from tivars.trie import *
 
 
 @total_ordering
@@ -29,20 +29,14 @@ class TIModel:
     """
 
     def __init__(self, name: str, features: 'TIFeature', magic: str, product_id: int, lang: str):
-        self._name = name
-        self._features = TIFeature(features)
-        self._magic = magic
-        self._product_id = product_id
-        self._lang = lang
+        self.name = name
+        self.features = TIFeature(features)
+        self.magic = magic
+        self.product_id = product_id
+        self.lang = lang
 
         with open(os.path.join(os.path.dirname(__file__), "../tokens/8X.xml"), encoding="UTF-8") as file:
-            self._tokens = Tokens.from_xml_string(file.read(), self.OS("latest"))
-
-        self._trie = {}
-        for lang in self._tokens.langs:
-            self._trie[lang] = TokenTrie.from_tokens(self._tokens, lang)
-
-        self._trie[None] = self._trie["en"]
+            self.tokens = TITokens(Tokens.from_xml_string(file.read(), self.OS("latest")))
 
     def __eq__(self, other):
         return str(self) == str(other)
@@ -56,71 +56,13 @@ def __hash__(self):
     def __str__(self):
         return self.name
 
-    @property
-    def features(self) -> 'TIFeature':
-        """
-        :return: This model's features
-        """
-
-        return self._features
-
-    @property
-    def lang(self) -> str:
-        """
-        :return: This model's native language
-        """
-
-        return self._lang
-
-    @property
-    def magic(self) -> str:
-        """
-        :return: This model's file magic
-        """
-
-        return self._magic
-
-    @property
-    def name(self) -> str:
-        """
-        :return: This model's (abbreviated) name
-        """
-
-        return self._name
-
     @property
     def order(self) -> int:
         """
         :return: This model's order within the chronology used by the token sheets
         """
 
-        return MODEL_ORDER[self._name]
-
-    @property
-    def product_id(self) -> int:
-        """
-        :return: This model's product ID
-        """
-
-        return self._product_id
-
-    @property
-    def tokens(self) -> Tokens:
-        """
-        :return: The tokens supported by this model
-        """
-
-        return self._tokens
-
-    def get_trie(self, lang: str = None) -> TokenTrie:
-        """
-        Gets the token trie for this model corresponding to a given language
-
-        :param lang: A language code (defaults to English, ``en``)
-        :return: The token trie corresponding to ``lang``
-        """
-
-        return self._trie[lang]
+        return MODEL_ORDER[self.name]
 
     def has(self, feature: 'TIFeature'):
         """
@@ -130,7 +66,7 @@ def has(self, feature: 'TIFeature'):
         :return: Whether this model has ``feature``
         """
 
-        return feature in self._features
+        return feature in self.features
 
     def OS(self, version: str = "") -> OsVersion:
         """
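
Example (not part of the diff): with the property boilerplate gone, a model's metadata reads as plain attributes, and the per-language tries now hang off the TITokens container rather than get_trie. A sketch of the call sites this change implies; the "en" key is an assumption based on the old trie table:

    from tivars.models import TI_84PCE

    TI_84PCE.name                        # plain attribute, no property wrapper
    TI_84PCE.order                       # still derived from MODEL_ORDER
    trie = TI_84PCE.tokens.tries["en"]   # was TI_84PCE.get_trie("en")
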
4 changes: 2 additions & 2 deletions tivars/models/versions.py
@@ -3,11 +3,11 @@
 """
 
 
-import tivars.tokens.scripts.parse as parse
+import tivars.tokens.scripts as tokens
 from .model import *
 
 
-class OsVersions(parse.OsVersions):
+class OsVersions(tokens.OsVersions):
     """
     Namespace containing useful OS versions
44 changes: 44 additions & 0 deletions tivars/token.py
@@ -0,0 +1,44 @@
+from tivars.tokens.scripts import *
+
+
+class TIToken(Token):
+    """
+    Interface extension for the token sheets base ``Token`` container
+
+    TITokens can be fetched by bytes or recognized names from a `TITokens` container attached to a `TIModel`.
+    Instantiating your own `TIToken` is not recommended.
+    """
+
+    def __init__(self, token: Token):
+        super().__init__(token.bits, token.langs, token.attrs, token.since, token.until)
+
+        self.translation = self.langs[None] = self.langs["en"]
+
+    def __repr__(self) -> str:
+        return f"<{self.display} ({self.escape})>"
+
+    @property
+    def accessible(self) -> str:
+        return self.translation.accessible
+
+    @property
+    def display(self) -> str:
+        return self.translation.display
+
+    @property
+    def escape(self) -> str:
+        return rf"\{'x' if len(self.bits) == 1 else 'u'}{self.bits.hex()}"
+
+    def names(self) -> list[str]:
+        return self.translation.names()
+
+
+class IllegalToken(TIToken):
+    def __init__(self, bits: bytes):
+        self.bits = bits
+
+        super().__init__(Token(bits, {"en": Translation(b'?', "?", self.escape, [])},
+                               {"illegal": "true"}))
+
+
+__all__ = ["TIToken", "IllegalToken"]
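
Example (not part of the diff): TIToken forwards the English translation's views, and IllegalToken is the catch-all wrapper for bytes without a token sheet entry. A minimal sketch; the byte values come from the test above, and the reprs follow the __repr__ defined here:

    from tivars.models import TI_84PCE
    from tivars.token import IllegalToken

    tok = TI_84PCE.tokens["setDate("]
    tok.escape               # r"\uef00": two bytes, so a \u escape
    repr(tok)                # "<setDate( (\uef00)>"

    bad = IllegalToken(b'\x00')
    repr(bad)                # "<? (\x00)>": display falls back to "?"
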
5 changes: 4 additions & 1 deletion tivars/tokenizer/__init__.py
@@ -7,7 +7,9 @@
 
 from tivars.data import String
 from tivars.models import *
+from tivars.token import *
 from tivars.tokens.scripts import *
+from tivars.trie import *
 from .decoder import *
 from .encoder import *
 
@@ -41,6 +43,7 @@ class Name(TokenizedString):
 
     @classmethod
     def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
+        # Is this necessary?
         mode = "max" if instance is not None and instance.leading_name_byte else "string"
         data = encode(value, mode=mode)[0].rstrip(b'\x00')
 
@@ -52,4 +55,4 @@ def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
 
 
 __all__ = ["decode", "encode", "normalize", "Name", "TokenizedString",
-           "Token", "Tokens", "OsVersion", "OsVersions"]
+           "TIToken", "IllegalToken", "TITokenTrie", "TITokens", "OsVersion", "OsVersions"]
21 changes: 8 additions & 13 deletions tivars/tokenizer/decoder.py
@@ -6,27 +6,22 @@
 
 from warnings import warn
 
 from tivars.models import *
-from tivars.tokens.scripts import *
+from tivars.token import *
+from tivars.trie import *
 
 
-def invalid_token(bits: bytes) -> Token:
-    name = rf"\x{bits.hex()}" if len(bits) == 1 else rf"\u{bits.hex()}"
-    return Token(bits, {"en": Translation(b'?', "?", name, [])})
-
-
-def decode(bytestream: bytes, *, tokens: Tokens = None) -> tuple[list[Token], OsVersion]:
+def decode(bytestream: bytes, *, tokens: TITokens = None) -> tuple[list[TIToken], OsVersion]:
     """
-    Decodes a byte stream into a list of `Token` objects and its minimum supported OS version
+    Decodes a byte stream into a list of `TIToken` objects and its minimum supported OS version
 
     Each token is represented using one of three different representations formats, dictated by ``mode``:
 
     - ``display``: Represents the tokens with Unicode characters matching the calculator's display
     - ``accessible``: Represents the tokens with ASCII-only equivalents, often requiring multi-character glyphs
    - ``ti_ascii``: Represents the tokens with their internal font indices (returns a ``bytes`` object)
 
     :param bytestream: The token bytes to decode
-    :param tokens: The `Tokens` object to use for decoding (defaults to the TI-84+CE tokens)
-    :return: A tuple of a list of `Token` objects and a minimum `OsVersion`
+    :param tokens: The `TITokens` object to use for decoding (defaults to the TI-84+CE tokens)
+    :return: A tuple of a list of `TIToken` objects and a minimum `OsVersion`
     """
 
     tokens = tokens or TI_84PCE.tokens
@@ -51,15 +46,15 @@ def decode(bytestream: bytes, *, tokens: Tokens = None) -> tuple[list[Token], OsVersion]:
                 warn(f"Unrecognized byte(s) '0x{curr_hex}' at position {index}.",
                      BytesWarning)
 
-                out.append(invalid_token(curr_bytes))
+                out.append(IllegalToken(curr_bytes))
                 curr_bytes = b''
 
             elif curr_bytes[-1]:
                 count = 0
                 while not curr_bytes[0]:
                     curr_bytes = curr_bytes[1:]
                     count += 1
-                    out.append(invalid_token(b'\x00'))
+                    out.append(IllegalToken(b'\x00'))
 
                 warn(f"There are {count} unexpected null bytes at position {index}." if count > 1 else
                      f"There is an unexpected null byte at position {index}.",
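
Example (not part of the diff): decoding is unchanged in shape; it now yields TIToken objects, with IllegalToken (plus a BytesWarning) standing in for unrecognized bytes. A sketch using the byte values from the test above:

    from tivars.tokenizer import decode

    toks, min_os = decode(b'\xef\x001')      # defaults to the TI-84+CE token sheet
    assert [tok.display for tok in toks] == ["setDate(", "1"]
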
6 changes: 3 additions & 3 deletions tivars/tokenizer/encoder.py
@@ -7,12 +7,12 @@
 import unicodedata
 
 from tivars.models import *
-from tivars.tokens.scripts import *
+from tivars.trie import *
 from .state import *
 
 
 def encode(string: str, *,
-           trie: TokenTrie = None, mode: str = None, normalize: bool = True) -> tuple[bytes, OsVersion]:
+           trie: TITokenTrie = None, mode: str = None, normalize: bool = True) -> tuple[bytes, OsVersion]:
     """
     Encodes a string of tokens represented as text into a byte stream and its minimum supported OS version
 
@@ -43,7 +43,7 @@
     """
 
     string = _normalize(string) if normalize else string
-    trie = trie or TI_84PCE.get_trie()
+    trie = trie or TI_84PCE.tokens.tries[None]
     mode = mode or "smart"
 
     data = b''
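
Example (not part of the diff): the default trie moves from TI_84PCE.get_trie() to the tries dict on the model's TITokens container, where the None key aliases English. Round-tripping the test's input under that default, as a sketch:

    from tivars.models import TI_84PCE
    from tivars.tokenizer import decode, encode

    data, min_os = encode("setDate(1")       # trie defaults to TI_84PCE.tokens.tries[None]
    assert data == b'\xef\x001'
    assert decode(data)[0] == [TI_84PCE.tokens["setDate("], TI_84PCE.tokens[b'1']]
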
23 changes: 11 additions & 12 deletions tivars/tokenizer/state.py
@@ -5,7 +5,8 @@
 
 from string import punctuation
 
-from tivars.tokens.scripts import *
+from tivars.token import *
+from tivars.trie import *
 
 
 class EncoderState:
@@ -28,7 +29,7 @@ class EncoderState:
     def __init__(self, length: int = 0):
         self.length = length
 
-    def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['EncoderState']]:
+    def munch(self, string: str, trie: TITokenTrie) -> tuple[TIToken, str, list['EncoderState']]:
         """
         Munch the input string and determine the resulting token, encoder state, and remainder of the string
@@ -41,13 +42,11 @@ def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['EncoderState']]:
         if string.startswith(r"\x") or string.startswith(r"\u"):
             length = 4 if string.startswith(r"\x") else 6
             string, remainder = string[:length], string[length:]
-            token = Token(bytes.fromhex(string.lstrip(r"\ux")),
-                          {"en": Translation(b'?', string, string, [])},
-                          {"illegal": "true"})
+            token = IllegalToken(bytes.fromhex(string.lstrip(r"\ux")))
 
             return token, remainder, self.next(token)
 
-        tokens = trie.get_tokens(string)
+        tokens = trie.match(string)
         if not tokens:
             raise ValueError("no tokenization options exist")
@@ -63,7 +62,7 @@ def munch(self, string: str, trie: TokenTrie) -> tuple[Token, str, list['EncoderState']]:
 
         return token, remainder, self.next(token)
 
-    def next(self, token: Token) -> list['EncoderState']:
+    def next(self, token: TIToken) -> list['EncoderState']:
         """
         Determines the next encode state given a token
@@ -102,7 +101,7 @@ class Line(EncoderState):
     Encoder state which is always exited after a line break
     """
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x04' | b'\x3F':
                 return []
@@ -118,7 +117,7 @@ class Name(Line):
 
     mode = -1
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         # Digits          Uppercase letters (and theta)
         if b'\x30' <= token.bits <= b'\x39' or b'\x41' <= token.bits <= b'\x5B':
             return super().next(token)
@@ -150,7 +149,7 @@ class String(Line):
 
     mode = -1
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return []
@@ -176,7 +175,7 @@ class InterpolationStart(Line):
 
     mode = 0
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return [InterpolatedString()]
@@ -192,7 +191,7 @@ class SmartMode(EncoderState):
 
     mode = 0
 
-    def next(self, token: Token) -> list[EncoderState]:
+    def next(self, token: TIToken) -> list[EncoderState]:
         match token.bits:
             case b'\x2A':
                 return [self, String()]
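
Example (not part of the diff): munch still recognizes the \xNN and \uNNNN escapes that TIToken.escape produces, but now funnels them through IllegalToken, and trie.match replaces trie.get_tokens for ordinary lookups. A sketch of the escape path, assuming encode drives these states as before:

    from tivars.tokenizer import encode

    # An escape encodes to exactly the bytes it names, even with no sheet entry
    data, _ = encode(r"\uef00" + "1")
    assert data == b'\xef\x001'
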