From 59b494ed86dd1f79149930f8aca57ba3cd8888e9 Mon Sep 17 00:00:00 2001 From: KG Date: Tue, 21 May 2024 23:40:49 -0500 Subject: [PATCH] Add support for multiple munching modes --- tests/tivars.py | 9 +++++ tivars/tokenizer/__init__.py | 69 ++++++++++++++++++++++++++++++++++-- tivars/types/tokenized.py | 16 +++++++-- 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/tests/tivars.py b/tests/tivars.py index 121167f..3402f64 100644 --- a/tests/tivars.py +++ b/tests/tivars.py @@ -199,6 +199,15 @@ def test_doors(self): self.assertEqual(test_program.decode(test_program.data[:26]), "Disp \"Needs Doors CSE\"") + def test_modes(self): + sneaky_prog = "Disp \"RIGHT\nLEFT" + + test_max = TIProgram.encode(sneaky_prog, mode="max") + test_minmax = TIProgram.encode(sneaky_prog, mode="minmax") + + self.assertEqual(test_max, b'\xDE\x2A\xEF\x94\x3F\xEF\x92') + self.assertEqual(test_minmax, b'\xDE\x2ARIGHT\x3F\xEF\x92') + class NumericTests(unittest.TestCase): def real_float_test(self, real_type, filename, name, sign, exponent, mantissa, string, dec): diff --git a/tivars/tokenizer/__init__.py b/tivars/tokenizer/__init__.py index b56a3db..81fd725 100644 --- a/tivars/tokenizer/__init__.py +++ b/tivars/tokenizer/__init__.py @@ -8,9 +8,35 @@ from tivars.tokens.scripts import * +STRING_STARTERS = b'\x2A' +""" +Token bytes which can start a string +""" + +STRING_TERMINATORS = b'\x04\x2A\x3F' +""" +Token bytes which can end a string +""" + + def decode(bytestream: bytes, *, tokens: Tokens = None, lang: str = "en", mode: str = "display") -> tuple[str | bytes, OsVersion]: + """ + Decodes a byte stream into a string of tokens and its minimum supported OS version + + Each token is represented using one of three different representation formats, dictated by ``mode``: + - ``display``: Represents the tokens with Unicode characters matching the calculator's display + - ``accessible``: Represents the tokens with ASCII-only equivalents, often requiring multi-character glyphs + - 
``ti_ascii``: Represents the tokens with their internal font indices (returns a ``bytes`` object) + + :param bytestream: The bytes to decode + :param tokens: The `Tokens` object to use for decoding + :param lang: The language used for the decoded token text (defaults to English, ``en``) + :param mode: The form of token representation to use for output (defaults to ``display``) + :return: A tuple of a string of token representations and a minimum `OsVersion` + """ + tokens = tokens or TI_84PCE.tokens out = [] @@ -43,13 +69,45 @@ def decode(bytestream: bytes, *, return b''.join(out) if mode == "ti_ascii" else "".join(out), since -def encode(string: str, trie: TokenTrie) -> tuple[bytes, OsVersion]: +def encode(string: str, trie: TokenTrie, mode: str = "max") -> tuple[bytes, OsVersion]: + """ + Encodes a string of tokens represented in text into a byte stream and its minimum supported OS version + + Tokenization is performed using one of three procedures, dictated by ``mode``: + - ``max``: Always munch maximally, i.e. consume the *longest* possible string to produce a token + - ``min``: Always munch minimally, i.e. 
consume the *shortest* possible string to produce a token + - ``minmax``: Munch maximally outside strings and minimally inside strings + + For reference, here are the tokenization modes utilized by popular IDEs and other software: + - SourceCoder: ``max`` + - TokenIDE: ``max`` + - TI-Planet Project Builder: ``minmax`` + - tivars_lib_cpp: ``minmax`` + + :param string: The tokens to encode + :param trie: The `TokenTrie` object to use for tokenization + :param mode: The tokenization mode to use (defaults to ``max``) + :return: A tuple of a stream of token bytes and a minimum `OsVersion` + """ + data = b'' since = OsVersions.INITIAL index = 0 + in_string = False while string: - token, remainder = trie.get_longest_token(string) + match mode: + case "max": + token, remainder = trie.get_tokens(string)[0] + + case "min": + token, remainder = trie.get_tokens(string)[-1] + + case "minmax" | "maxmin": + token, remainder = trie.get_tokens(string)[-1 if in_string else 0] + + case _: + raise ValueError(f"unrecognized tokenization mode: '{mode}'") if token is None: raise ValueError(f"could not tokenize input at position {index}: '{string[:12]}'") @@ -60,6 +118,13 @@ def encode(string: str, trie: TokenTrie) -> tuple[bytes, OsVersion]: index += len(string) - len(remainder) string = remainder + match in_string: + case False if token.bits in STRING_STARTERS: + in_string = True + + case True if token.bits in STRING_TERMINATORS: + in_string = False + return data, since diff --git a/tivars/types/tokenized.py b/tivars/types/tokenized.py index 019405b..73fb3cc 100644 --- a/tivars/types/tokenized.py +++ b/tivars/types/tokenized.py @@ -76,18 +76,30 @@ def decode(data: bytes, *, lang: str = "en", mode: str = "display") -> str | byt return decode(data, lang=lang, mode=mode)[0] @staticmethod - def encode(string: str, *, model: TIModel = None, lang: str = None) -> bytes: + def encode(string: str, *, model: TIModel = None, lang: str = None, mode: str = "max") -> bytes: """ Encodes a string of 
tokens represented in text into a byte stream + Tokenization is performed using one of three procedures, dictated by ``mode``: + - ``max``: Always munch maximally, i.e. consume the *longest* possible string to produce a token + - ``min``: Always munch minimally, i.e. consume the *shortest* possible string to produce a token + - ``minmax``: Munch maximally outside strings and minimally inside strings + + For reference, here are the tokenization modes utilized by popular IDEs and other software: + - SourceCoder: ``max`` + - TokenIDE: ``max`` + - TI-Planet Project Builder: ``minmax`` + - tivars_lib_cpp: ``minmax`` + :param string: The tokens to encode :param model: The model to target when encoding (defaults to no specific model) :param lang: The language used in ``string`` (defaults to English, ``en``) + :param mode: The tokenization mode to use (defaults to ``max``) :return: A stream of token bytes """ model = model or TI_84PCE - return encode(string, model.get_trie(lang))[0] + return encode(string, model.get_trie(lang), mode)[0] def get_min_os(self, data: bytes = None) -> OsVersion: return decode(data or self.data)[1]