Add token accessors and rework name converters
kg583 committed Dec 19, 2024
1 parent 5ac72c2 commit 05208f9
Showing 8 changed files with 81 additions and 56 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -261,7 +261,7 @@ All entry types support string formatting using Python's f-strings.
- `width`: how many digits to group together *(default: no groups)*
- Tokenized entries support formatting of their tokens into readable lines: `{line_spec}{sep}{type}{lang}`
- `line_spec`: format specifier for line numbers *(default: no line numbers)*
- `sep`: a string to separate lines and line numbers *(default: none)*
- `sep`: a string to separate lines and line numbers *(required for line numbering)*
- `type`: how to format each token
- `a`: use accessible names
- `d`: use display names *(default)*
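As a rough illustration of the reworked spec described above (the entry name and program body are assumptions, not part of this diff):

```python
from tivars import TIProgram

prgm = TIProgram(name="HELLO")
prgm.load_string('Disp "HELLO')

print(f"{prgm}")        # display names, no line numbers
print(f"{prgm:a}")      # accessible names
print(f"{prgm:d.en}")   # display names with an explicit language suffix
print(f"{prgm:02d: }")  # two-digit line numbers followed by ": "
```

A bare `line_spec` with no trailing `sep` no longer parses, which is why `sep` is now marked as required for line numbering.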
22 changes: 18 additions & 4 deletions tivars/tokenizer/__init__.py
@@ -16,14 +16,28 @@ class TokenizedString(String):
"""
Converter for data sections best interpreted as strings of tokens
Tokenization uses the TI-84+CE token sheet, which is backwards compatible for all var name tokens.
Tokenization uses the TI-84+CE token sheet.
"""

_T = str

@classmethod
def get(cls, data: bytes, **kwargs) -> _T:
return decode(data.ljust(8, b'\x00'))[0]
return "".join(token.langs["en"].display for token in decode(data.ljust(8, b'\x00'))[0])

@classmethod
def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
return encode(value)[0].rstrip(b'\x00')


class Name(TokenizedString):
"""
Converter for names of vars
Tokenization uses the TI-84+CE token sheet, which is backwards compatible for all var name tokens.
"""

_T = str

@classmethod
def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
@@ -37,5 +51,5 @@ def set(cls, value: _T, *, instance=None, **kwargs) -> bytes:
return data


__all__ = ["decode", "encode", "normalize", "TokenizedString",
"Tokens", "OsVersion", "OsVersions"]
__all__ = ["decode", "encode", "normalize", "Name", "TokenizedString",
"Token", "Tokens", "OsVersion", "OsVersions"]
49 changes: 17 additions & 32 deletions tivars/tokenizer/decoder.py
@@ -9,11 +9,15 @@
from tivars.tokens.scripts import *


def decode(bytestream: bytes, *,
tokens: Tokens = None, lang: str = "en",
mode: str = "display") -> tuple[str | bytes, OsVersion]:
def invalid_token(bits: bytes) -> Token:
name = rf"\x{bits.hex()}" if len(bits) == 1 else rf"\u{bits.hex()}"
return Token(bits, {"en": Translation(b'?', "?", name, [])})



def decode(bytestream: bytes, *, tokens: Tokens = None) -> tuple[list[Token], OsVersion]:
"""
Decodes a byte stream into a string of tokens and its minimum supported OS version
Decodes a byte stream into a list of `Token` objects and its minimum supported OS version
Each token is represented using one of three different representation formats, dictated by ``mode``:
- ``display``: Represents the tokens with Unicode characters matching the calculator's display
@@ -22,18 +26,14 @@ def decode(bytestream: bytes, *,
:param bytestream: The token bytes to decode
:param tokens: The `Tokens` object to use for decoding (defaults to the TI-84+CE tokens)
:param lang: The language used in ``string`` (defaults to English, ``en``)
:param mode: The form of token representation to use for output (defaults to ``display``)
:return: A tuple of a string of token representations and a minimum `OsVersion`
:return: A tuple of a list of `Token` objects and a minimum `OsVersion`
"""

tokens = tokens or TI_84PCE.tokens

out = []
since = OsVersions.INITIAL

byte_attr = mode == "ti_ascii"

index = 0
curr_bytes = b''
while index < len(bytestream):
@@ -42,50 +42,35 @@

if curr_bytes[0]:
if curr_bytes in tokens.bytes:
try:
out.append(getattr(tokens.bytes[curr_bytes].langs[lang], mode))

except AttributeError:
raise ValueError(f"'{mode}' is not a recognized token representation")

except KeyError:
raise ValueError(f"'{lang}' is not a recognized language")

out.append(tokens.bytes[curr_bytes])
since = max(tokens.bytes[curr_bytes].since, since)

curr_bytes = b''

elif len(curr_bytes) >= 2:
if not any(key.startswith(curr_bytes[:1]) for key in tokens.bytes):
warn(f"Unrecognized byte '0x{curr_hex}' at position {index}.",
BytesWarning)

out.append(b'?' if byte_attr else rf"\x{curr_hex}")

else:
warn(f"Unrecognized bytes '0x{curr_hex}' at position {index}.",
BytesWarning)

out.append(b'?' if byte_attr else rf"\u{curr_hex}")
warn(f"Unrecognized byte(s) '0x{curr_hex}' at position {index}.",
BytesWarning)

out.append(invalid_token(curr_bytes))
curr_bytes = b''

elif any(curr_bytes):
elif curr_bytes[-1]:
count = 0
while not curr_bytes[0]:
curr_bytes = curr_bytes[1:]
count += 1
out.append(b'?' if byte_attr else r"\x00")
out.append(invalid_token(b'\x00'))

warn(f"There are {count} unexpected null bytes at position {index}." if count > 1 else
f"There is an unexpected null byte at position {index}.",
BytesWarning)

curr_bytes = b''
index -= 1

index += 1

return b''.join(out) if byte_attr else "".join(out), since
return out, since


__all__ = ["decode"]
2 changes: 1 addition & 1 deletion tivars/types/gdb.py
@@ -283,7 +283,7 @@ def json_name(self) -> str:
:return: The name of this equation used in the GDB JSON format
"""

return decode(self.raw.name, mode="accessible")[0].strip("{}|")
return self.decode(self.raw.name, mode="accessible").strip("{}|")

def load_data_section(self, data: BytesIO):
flag_byte = data.read(1)
2 changes: 1 addition & 1 deletion tivars/types/list.py
@@ -15,7 +15,7 @@
from .real import RealEntry


class ListName(TokenizedString):
class ListName(Name):
"""
Converter for the name section of lists
4 changes: 2 additions & 2 deletions tivars/types/picture.py
@@ -11,7 +11,7 @@

from tivars.data import *
from tivars.models import *
from tivars.tokenizer import TokenizedString
from tivars.tokenizer import Name
from tivars.var import SizedEntry

RGB = tuple[int, int, int]
@@ -331,7 +331,7 @@ def array(self) -> list[list[pixel_type]]:


# Workaround until the token sheets are updated
class ImageName(TokenizedString):
class ImageName(Name):
"""
Converter for the name section of images
52 changes: 39 additions & 13 deletions tivars/types/tokenized.py
@@ -6,6 +6,7 @@
import re

from io import BytesIO
from typing import Sequence
from warnings import catch_warnings, simplefilter, warn

from tivars.data import *
@@ -43,9 +44,8 @@ class TokenizedEntry(SizedEntry):

def __format__(self, format_spec: str) -> str:
try:
lines, sep, spec, lang = re.match(r"(.*?[a-z%#])?(\W*)(\w?)\.?(\w+)?", format_spec).groups()
line_number = f"{{index:{lines}}}{sep}" if lines else sep
lang = lang or "en"
lines, sep, spec, lang = re.match(r"(?:(.*?[a-z%#])(\W+))?(\w?)(\.\w+)?$", format_spec).groups()
lang = (lang or ".en")[1:]

match spec:
case "" | "d":
@@ -57,25 +57,31 @@
case _:
raise KeyError

return "\n".join(line_number.format(index=index) + line for index, line in enumerate(string.split("\n")))
if lines:
return "\n".join(f"{index:{lines}}{sep}" + line for index, line in enumerate(string.split("\n")))

else:
return string

except (AttributeError, KeyError, TypeError, ValueError):
return super().__format__(format_spec)

@staticmethod
def decode(data: bytes, *, lang: str = "en", mode: str = "display") -> str | bytes:
def decode(data: bytes, *, lang: str = "en", mode: str = "display") -> str:
"""
Decodes a byte stream into a string of tokens
For detailed information on tokenization modes, see `tivars.tokenizer.decode`.
:param data: The token bytes to decode
:param lang: The language used in ``string`` (defaults to English, ``en``)
:param mode: The form of token representation to use for output (defaults to ``display``)
:return: A string of token representations
"""

return decode(data, lang=lang, mode=mode)[0]
try:
return "".join(getattr(token.langs[lang], mode) for token in decode(data)[0])

except (AttributeError, TypeError):
raise ValueError(f"unrecognized tokenization mode: '{mode}'")

@staticmethod
def encode(string: str, *, model: TIModel = None, lang: str = None, mode: str = None) -> bytes:
@@ -165,6 +171,23 @@ def load_string(self, string: str, *, model: TIModel = None, lang: str = None, m

self.data = self.encode(string, model=model, lang=lang, mode=mode)

@Loader[Sequence[Token]]
def load_tokens(self, tokens: Sequence[Token]):
"""
Loads this entry from a sequence of `Token` objects
:param tokens: The sequence of tokens to load
"""

self.data = b''.join(token.bits for token in tokens)

def tokens(self) -> list[Token]:
"""
:return: The tokens comprising this entry as a list of `Token` objects
"""

return decode(self.data)[0]


class TIEquation(TokenizedEntry, register=True):
"""
Expand All @@ -191,7 +214,7 @@ def __init__(self, init=None, *,

super().__init__(init, for_flash=for_flash, name=name, version=version, archived=archived, data=data)

@Section(8, TokenizedString)
@Section(8, Name)
def name(self, value) -> str:
"""
The name of the entry
@@ -245,7 +268,7 @@ def __init__(self, init=None, *,

super().__init__(init, for_flash=for_flash, name=name, version=version, archived=archived, data=data)

@Section(8, TokenizedString)
@Section(8, Name)
def name(self, value) -> str:
"""
The name of the entry
@@ -335,13 +358,16 @@ def load_string(self, string: str, *, model: TIModel = None, lang: str = None, m
super().load_string(string, model=model, lang=lang, mode=mode)

def string(self) -> str:
string = super().string()

if not self.is_tokenized:
warn("ASM programs may not have tokenized data.",
UserWarning)

return string
with catch_warnings():
simplefilter("ignore")
return super().string()

else:
return super().string()

def coerce(self):
with catch_warnings():
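A rough usage sketch of the new accessors on `TokenizedEntry` (the program name and body are assumptions):

```python
from tivars import TIProgram

prgm = TIProgram(name="HELLO")
prgm.load_string('Disp "HELLO')

toks = prgm.tokens()           # the data section as a list of Token objects
copy = TIProgram(name="COPY")
copy.load_tokens(toks)         # rebuild another entry token-by-token

assert copy.string() == prgm.string()
```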
4 changes: 2 additions & 2 deletions tivars/var.py
@@ -13,7 +13,7 @@

from .data import *
from .models import *
from .tokenizer import TokenizedString
from .tokenizer import Name


match version_info[:2]:
@@ -515,7 +515,7 @@ def type_id(self) -> int:
The type determines how the contents of the data section of the entry are interpreted.
"""

@Section(8, TokenizedString)
@Section(8, Name)
def name(self) -> str:
"""
The name of the entry
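With `Name` as the converter here, the name section still reads and writes plain strings; a minimal sketch, assuming a program entry:

```python
from tivars import TIProgram

prgm = TIProgram()
prgm.name = "ABC"    # tokenized into the 8-byte name section by Name.set
print(prgm.name)     # "ABC", decoded back via the inherited TokenizedString.get
```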
