From 59b494ed86dd1f79149930f8aca57ba3cd8888e9 Mon Sep 17 00:00:00 2001 From: KG Date: Tue, 21 May 2024 23:40:49 -0500 Subject: [PATCH] Add support for multiple munching modes --- tests/tivars.py | 9 +++++ tivars/tokenizer/__init__.py | 69 ++++++++++++++++++++++++++++++++++-- tivars/types/tokenized.py | 16 +++++++-- 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/tests/tivars.py b/tests/tivars.py index 121167f..3402f64 100644 --- a/tests/tivars.py +++ b/tests/tivars.py @@ -199,6 +199,15 @@ def test_doors(self): self.assertEqual(test_program.decode(test_program.data[:26]), "Disp \"Needs Doors CSE\"") + def test_modes(self): + sneaky_prog = "Disp \"RIGHT\nLEFT" + + test_max = TIProgram.encode(sneaky_prog, mode="max") + test_minmax = TIProgram.encode(sneaky_prog, mode="minmax") + + self.assertEqual(test_max, b'\xDE\x2A\xEF\x94\x3F\xEF\x92') + self.assertEqual(test_minmax, b'\xDE\x2ARIGHT\x3F\xEF\x92') + class NumericTests(unittest.TestCase): def real_float_test(self, real_type, filename, name, sign, exponent, mantissa, string, dec): diff --git a/tivars/tokenizer/__init__.py b/tivars/tokenizer/__init__.py index b56a3db..81fd725 100644 --- a/tivars/tokenizer/__init__.py +++ b/tivars/tokenizer/__init__.py @@ -8,9 +8,35 @@ from tivars.tokens.scripts import * +STRING_STARTERS = b'\x2A' +""" +Token bytes which can start a string +""" + +STRING_TERMINATORS = b'\x04\x2A\x3F' +""" +Token bytes which can end a string +""" + + def decode(bytestream: bytes, *, tokens: Tokens = None, lang: str = "en", mode: str = "display") -> tuple[str | bytes, OsVersion]: + """ + Decodes a byte stream into a string of tokens and its minimum supported OS version + + Each token is represented using one of three different representation formats, dictated by ``mode``: + - ``display``: Represents the tokens with Unicode characters matching the calculator's display + - ``accessible``: Represents the tokens with ASCII-only equivalents, often requiring multi-character glyphs + - 
``ti_ascii``: Represents the tokens with their internal font indices (returns a ``bytes`` object) + + :param bytestream: The bytes to decode + :param tokens: The `Tokens` object to use for decoding + :param lang: The language used for the decoded token text (defaults to English, ``en``) + :param mode: The form of token representation to use for output (defaults to ``display``) + :return: A tuple of a string of token representations and a minimum `OsVersion` + """ + tokens = tokens or TI_84PCE.tokens out = [] @@ -43,13 +69,45 @@ def decode(bytestream: bytes, *, return b''.join(out) if mode == "ti_ascii" else "".join(out), since -def encode(string: str, trie: TokenTrie) -> tuple[bytes, OsVersion]: +def encode(string: str, trie: TokenTrie, mode: str = "max") -> tuple[bytes, OsVersion]: + """ + Encodes a string of tokens represented in text into a byte stream and its minimum supported OS version + + Tokenization is performed using one of three procedures, dictated by ``mode``: + - ``max``: Always munch maximally, i.e. consume the *longest* possible string to produce a token + - ``min``: Always munch minimally, i.e. 
consume the *shortest* possible string to produce a token + - ``minmax``: Munch maximally outside strings and minimally inside strings + + For reference, here are the tokenization modes utilized by popular IDEs and other software: + - SourceCoder: ``max`` + - TokenIDE: ``max`` + - TI-Planet Project Builder: ``minmax`` + - tivars_lib_cpp: ``minmax`` + + :param string: The tokens to encode + :param trie: The `TokenTrie` object to use for tokenization + :param mode: The tokenization mode to use (defaults to ``max``) + :return: A tuple of a stream of token bytes and a minimum `OsVersion` + """ + data = b'' since = OsVersions.INITIAL index = 0 + in_string = False while string: - token, remainder = trie.get_longest_token(string) + match mode: + case "max": + token, remainder = trie.get_tokens(string)[0] + + case "min": + token, remainder = trie.get_tokens(string)[-1] + + case "minmax" | "maxmin": + token, remainder = trie.get_tokens(string)[-1 if in_string else 0] + + case _: + raise ValueError(f"unrecognized tokenization mode: '{mode}'") if token is None: raise ValueError(f"could not tokenize input at position {index}: '{string[:12]}'") @@ -60,6 +118,13 @@ def encode(string: str, trie: TokenTrie) -> tuple[bytes, OsVersion]: index += len(string) - len(remainder) string = remainder + match in_string: + case False if token.bits in STRING_STARTERS: + in_string = True + + case True if token.bits in STRING_TERMINATORS: + in_string = False + return data, since diff --git a/tivars/types/tokenized.py b/tivars/types/tokenized.py index 019405b..73fb3cc 100644 --- a/tivars/types/tokenized.py +++ b/tivars/types/tokenized.py @@ -76,18 +76,30 @@ def decode(data: bytes, *, lang: str = "en", mode: str = "display") -> str | byt return decode(data, lang=lang, mode=mode)[0] @staticmethod - def encode(string: str, *, model: TIModel = None, lang: str = None) -> bytes: + def encode(string: str, *, model: TIModel = None, lang: str = None, mode: str = "max") -> bytes: """ Encodes a string of 
tokens represented in text into a byte stream + Tokenization is performed using one of three procedures, dictated by ``mode``: + - ``max``: Always munch maximally, i.e. consume the *longest* possible string to produce a token + - ``min``: Always munch minimally, i.e. consume the *shortest* possible string to produce a token + - ``minmax``: Munch maximally outside strings and minimally inside strings + + For reference, here are the tokenization modes utilized by popular IDEs and other software: + - SourceCoder: ``max`` + - TokenIDE: ``max`` + - TI-Planet Project Builder: ``minmax`` + - tivars_lib_cpp: ``minmax`` + :param string: The tokens to encode :param model: The model to target when encoding (defaults to no specific model) :param lang: The language used in ``string`` (defaults to English, ``en``) + :param mode: The tokenization mode to use (defaults to ``max``) :return: A stream of token bytes """ model = model or TI_84PCE - return encode(string, model.get_trie(lang))[0] + return encode(string, model.get_trie(lang), mode)[0] def get_min_os(self, data: bytes = None) -> OsVersion: return decode(data or self.data)[1]