Skip to content

Commit

Permalink
Add support for multiple munching modes
Browse files Browse the repository at this point in the history
  • Loading branch information
kg583 committed May 22, 2024
1 parent 1ec15a6 commit 59b494e
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 4 deletions.
9 changes: 9 additions & 0 deletions tests/tivars.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,15 @@ def test_doors(self):

self.assertEqual(test_program.decode(test_program.data[:26]), "Disp \"Needs Doors CSE\"")

def test_modes(self):
    # A program whose text tokenizes differently inside vs. outside a string:
    # with maximal munching, RIGHT and LEFT each collapse into a single token,
    # while minimal munching inside the string keeps R-I-G-H-T as characters.
    source = "Disp \"RIGHT\nLEFT"

    expected = {
        "max": b'\xDE\x2A\xEF\x94\x3F\xEF\x92',
        "minmax": b'\xDE\x2ARIGHT\x3F\xEF\x92',
    }

    for mode, data in expected.items():
        self.assertEqual(TIProgram.encode(source, mode=mode), data)


class NumericTests(unittest.TestCase):
def real_float_test(self, real_type, filename, name, sign, exponent, mantissa, string, dec):
Expand Down
69 changes: 67 additions & 2 deletions tivars/tokenizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,35 @@
from tivars.tokens.scripts import *


# 0x2A is the double-quote token (see the encode tests: `Disp "` -> DE 2A).
STRING_STARTERS = bytes([0x2A])
"""
Token bytes which can start a string
"""

# 0x2A (`"`) and 0x3F (newline) close a string, as does 0x04
# (presumably the store arrow -- confirm against the token sheet).
STRING_TERMINATORS = bytes([0x04, 0x2A, 0x3F])
"""
Token bytes which can end a string
"""


def decode(bytestream: bytes, *,
tokens: Tokens = None, lang: str = "en",
mode: str = "display") -> tuple[str | bytes, OsVersion]:
"""
Decodes a byte stream into a string of tokens and its minimum supported OS version
Each token is represented using one of three different representations formats, dictated by ``mode``:
- ``display``: Represents the tokens with Unicode characters matching the calculator's display
- ``accessible``: Represents the tokens with ASCII-only equivalents, often requiring multi-character glyphs
- ``ti_ascii``: Represents the tokens with their internal font indices (returns a ``bytes`` object)
:param bytestream: The bytes to decode
:param tokens: The `Tokens` object to use for decoding
:param lang: The language used in ``string`` (defaults to English, ``en``)
:param mode: The form of token representation to use for output (defaults to ``display``)
:return: A tuple of a string of token representations and a minimum `OsVersion`
"""

tokens = tokens or TI_84PCE.tokens

out = []
Expand Down Expand Up @@ -43,13 +69,45 @@ def decode(bytestream: bytes, *,
return b''.join(out) if mode == "ti_ascii" else "".join(out), since


def encode(string: str, trie: TokenTrie) -> tuple[bytes, OsVersion]:
def encode(string: str, trie: TokenTrie, mode: str = "max") -> tuple[bytes, OsVersion]:
"""
Encodes a string of token represented in text into a byte stream and its minimum supported OS version
Tokenization is performed using one of three procedures, dictated by ``mode``:
- ``max``: Always munch maximally, i.e. consume the *longest* possible string to produce a token
- ``min``: Always munch minimally, i.e. consume the *shortest* possible string to produce a token
- ``minmax``: Munch maximally outside strings and minimally inside strings
For reference, here are the tokenization modes utilized by popular IDEs and other software:
- SourceCoder: ``max``
- TokenIDE: ``max``
- TI-Planet Project Builder: ``minmax``
- tivars_lib_cpp: ``minmax``
:param string: The tokens to encode
:param trie: The `TokenTrie` object to use for tokenization
:param mode: The tokenization mode to use (defaults to ``max``)
:return: A tuple of a stream of token bytes and a minimum `OsVersion`
"""

data = b''
since = OsVersions.INITIAL
index = 0
in_string = False

while string:
token, remainder = trie.get_longest_token(string)
match mode:
case "max":
token, remainder = trie.get_tokens(string)[0]

case "min":
token, remainder = trie.get_tokens(string)[-1]

case "minmax" | "maxmin":
token, remainder = trie.get_tokens(string)[-1 if in_string else 0]

case _:
raise ValueError(f"unrecognized tokenization mode: '{mode}'")

if token is None:
raise ValueError(f"could not tokenize input at position {index}: '{string[:12]}'")
Expand All @@ -60,6 +118,13 @@ def encode(string: str, trie: TokenTrie) -> tuple[bytes, OsVersion]:
index += len(string) - len(remainder)
string = remainder

match in_string:
case False if token.bits in STRING_STARTERS:
in_string = True

case True if token.bits in STRING_TERMINATORS:
in_string = False

return data, since


Expand Down
16 changes: 14 additions & 2 deletions tivars/types/tokenized.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,18 +76,30 @@ def decode(data: bytes, *, lang: str = "en", mode: str = "display") -> str | byt
return decode(data, lang=lang, mode=mode)[0]

@staticmethod
def encode(string: str, *, model: TIModel = None, lang: str = None, mode: str = "max") -> bytes:
    """
    Encodes a string of tokens represented in text into a byte stream

    Tokenization is performed using one of three procedures, dictated by ``mode``:
        - ``max``: Always munch maximally, i.e. consume the *longest* possible string to produce a token
        - ``min``: Always munch minimally, i.e. consume the *shortest* possible string to produce a token
        - ``minmax``: Munch maximally outside strings and minimally inside strings

    For reference, here are the tokenization modes utilized by popular IDEs and other software:
        - SourceCoder: ``max``
        - TokenIDE: ``max``
        - TI-Planet Project Builder: ``minmax``
        - tivars_lib_cpp: ``minmax``

    :param string: The tokens to encode
    :param model: The model to target when encoding (defaults to no specific model)
    :param lang: The language used in ``string`` (defaults to English, ``en``)
    :param mode: The tokenization mode to use (defaults to ``max``)
    :return: A stream of token bytes
    """

    # Fall back to the latest-model trie when no target model is given.
    trie = (model or TI_84PCE).get_trie(lang)

    # The module-level encoder also reports a minimum OS version; only the
    # byte stream is surfaced here.
    data, _ = encode(string, trie, mode)
    return data

def get_min_os(self, data: bytes = None) -> OsVersion:
    """
    Determines the minimum OS version required by a stream of token bytes

    :param data: The token bytes to inspect (defaults to this entry's own data)
    :return: The minimum supported `OsVersion` reported by the decoder
    """

    # Explicit None check: the previous `data or self.data` treated any
    # falsy argument -- notably an explicitly-passed b'' -- as "no argument"
    # and silently decoded self.data instead of the caller's bytes.
    return decode(self.data if data is None else data)[1]
Expand Down

0 comments on commit 59b494e

Please sign in to comment.