From 95a70b1659e4d82f65c0c05b027d3a60771b594c Mon Sep 17 00:00:00 2001
From: KG <kgscience@hotmail.com>
Date: Wed, 20 Dec 2023 15:22:14 -0500
Subject: [PATCH 1/5] Add TokenIDE file generator

---
 scripts/__init__.py |   3 +-
 scripts/tokenide.py | 178 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+), 1 deletion(-)
 create mode 100644 scripts/tokenide.py

diff --git a/scripts/__init__.py b/scripts/__init__.py
index be7f51a..ab4d28c 100644
--- a/scripts/__init__.py
+++ b/scripts/__init__.py
@@ -1,4 +1,5 @@
 from .parse import Token, Tokens, OsVersion, OsVersions, Translation
+from .tokenide import TokenIDESheet
 from .trie import TokenTrie
 
-__all__ = ["Token", "Tokens", "OsVersion", "OsVersions", "Translation", "TokenTrie"]
+__all__ = ["Token", "Tokens", "OsVersion", "OsVersions", "Translation", "TokenIDESheet", "TokenTrie"]
diff --git a/scripts/tokenide.py b/scripts/tokenide.py
new file mode 100644
index 0000000..a224247
--- /dev/null
+++ b/scripts/tokenide.py
@@ -0,0 +1,178 @@
+import os
+import xml.etree.ElementTree as ET
+
+from .parse import Tokens, OsVersion, OsVersions
+
+
+COMMENT = """<!--
+TokenIDE-compatible token file generated using the TI-Toolkit token sheets:
+https://github.com/TI-Toolkit/tokens
+
+TokenIDE created by
+Shaun McFall, Merthsoft Creations
+shaunm.mcfall@gmail.com
+-->"""
+
+
+class TokenIDESheet:
+    """
+    Data class representing the contents of a TokenIDE token file
+
+    The sheet is a dictionary with two elements:
+        - tokens:   a recursing dictionary of tokens, indexed by byte
+        - meta:     global metadata for TokenIDE concerning styling and grouping
+
+    If an existing TokenIDE token file is not used as a base, no metadata is present.
+    """
+
+    NAMESPACE = "http://merthsoft.com/Tokens"
+
+    STARTERS = [b'\x2A']
+    TERMINATORS = [b'\x04', b'*', b'\x3F']
+
+    def __init__(self, sheet: dict[str] = None):
+        self.sheet = sheet or {"tokens": {}, "meta": []}
+
+    @staticmethod
+    def from_xml_string(xml_str: str) -> 'TokenIDESheet':
+        """
+        Constructs an instance from an XML string
+
+        :param xml_str: An XML string
+        :return: A TokenIDESheet corresponding to the string
+        """
+
+        return TokenIDESheet.from_element(ET.fromstring(xml_str))
+
+    @staticmethod
+    def from_element(root: ET.Element) -> 'TokenIDESheet':
+        """
+        Constructs an instance from an XML element in a TokenIDE token file
+
+        :param root: An XML element, which must be the root element of the file
+        :return: A TokenIDESheet corresponding to the root element
+        """
+
+        if root.tag != f"{{{TokenIDESheet.NAMESPACE}}}Tokens":
+            raise ValueError("Not a TokenIDE xml.")
+
+        sheet: dict[str] = {"tokens": {}, "meta": []}
+
+        def parse_page(element: ET.Element, dct: dict):
+            match element.tag.removeprefix(f"{{{TokenIDESheet.NAMESPACE}}}"):
+                case "Token":
+                    attrib = element.attrib
+
+                    dct["tokens"][attrib.pop("byte")] = dct = {"string": attrib.pop("string", None), "variants": set(),
+                                                               "attrib": attrib, "tokens": {}}
+
+                case "Alt":
+                    dct["variants"].add(element.attrib["string"])
+
+                case "Groups" | "Styles":
+                    sheet["meta"].append(element)
+
+            for child in element:
+                parse_page(child, dct)
+
+        parse_page(root, sheet)
+        return TokenIDESheet(sheet)
+
+    def to_xml_string(self) -> str:
+        """
+        :return: This sheet as an indented XML string
+        """
+
+        element = self.to_element()
+        ET.indent(element, "  ")
+
+        # ET does not provide a method to insert a header comment
+        string = ET.tostring(element, encoding="utf8").decode()
+        string = string.replace("utf8", "utf-8")
+        return string.replace("?>", "?>\n" + COMMENT)
+
+    def to_element(self) -> ET.Element:
+        """
+        :return: This sheet as an XML element
+        """
+
+        sheet = ET.Element(f"{{{TokenIDESheet.NAMESPACE}}}Tokens",
+                           {"xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
+                            "xmlns:xsd": "http://www.w3.org/2001/XMLSchema"})
+
+        sheet.extend(self.sheet["meta"])
+
+        def build_page(element: ET.Element, byte: str, dct: dict):
+            if byte:
+                element = ET.SubElement(element, "Token", byte=byte,
+                                        **({"string": dct["string"]} if dct.get("string", None) is not None else {}),
+                                        **dct.get("attrib", {}))
+
+                for name in dct.get("variants", set()):
+                    element.append(ET.Element("Alt", string=name))
+
+            for child in dict(sorted(dct.get("tokens", {}).items())):
+                build_page(element, child, dct["tokens"][child])
+
+        build_page(sheet, "", self.sheet)
+        return sheet
+
+    def with_tokens(self, *,
+                    version: OsVersion = None, tokens: Tokens = None, file=None, lang: str = 'en') -> 'TokenIDESheet':
+        """
+        Constructs a copy of this sheet updated with the specified token data from the token sheets
+
+        If a token is entirely absent, its accessible name is used as its string value.
+        Metadata is always preserved.
+
+        :param version: A minimum OS version to target (defaults to latest)
+        :param tokens: A Tokens container of tokens to add (defaults to all tokens)
+        :param file: A file object to read tokens from (defaults to the 8X token sheet)
+        :param lang: A language code (defaults to "en")
+        :return: A TokenIDESheet containing the union of this sheet and the specified token data
+        """
+
+        sheet = self.sheet.copy()
+
+        if tokens is None:
+            if file is None:
+                with open(os.path.join(os.path.dirname(__file__), "../8X.xml"), encoding="UTF-8") as file:
+                    tokens = Tokens.from_xml_string(file.read(), version or OsVersions.LATEST)
+
+            else:
+                tokens = Tokens.from_xml_string(file.read(), version or OsVersions.LATEST)
+
+        for byte, token in tokens.bytes.items():
+            if version is not None and token.since > version:
+                continue
+
+            leading, trailing = byte[:1], byte[1:]
+
+            dct = sheet["tokens"]
+            value = f"${leading.hex().upper()}"
+
+            if value not in dct:
+                dct[value] = {"string": None, "variants": set(), "attrib": {}, "tokens": {}}
+
+            if trailing:
+                dct = dct[value]["tokens"]
+                value = f"${trailing.hex().upper()}"
+
+                if value not in dct:
+                    dct[value] = {"string": None, "variants": set(), "attrib": {}, "tokens": {}}
+
+            translation = token.langs.get(lang, "en")
+            dct[value]["string"] = dct[value]["string"] or translation.accessible
+            dct[value]["variants"] |= {name for name in translation.names() if name != dct[value]["string"]}
+
+            if byte in TokenIDESheet.STARTERS:
+                dct[value]["attrib"]["stringStarter"] = "true"
+
+            if byte in TokenIDESheet.TERMINATORS:
+                dct[value]["attrib"]["stringTerminator"] = "true"
+
+        return TokenIDESheet(sheet)
+
+
+ET.register_namespace("", TokenIDESheet.NAMESPACE)
+__all__ = ["TokenIDESheet"]

From 5a13f45f73e7944b82d74cf1d1c6b7312be1e1df Mon Sep 17 00:00:00 2001
From: KG <kgscience@hotmail.com>
Date: Wed, 20 Dec 2023 16:24:34 -0500
Subject: [PATCH 2/5] Improve child iterator

---
 scripts/tokenide.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/tokenide.py b/scripts/tokenide.py
index a224247..270ba54 100644
--- a/scripts/tokenide.py
+++ b/scripts/tokenide.py
@@ -28,7 +28,7 @@ class TokenIDESheet:
     NAMESPACE = "http://merthsoft.com/Tokens"
 
     STARTERS = [b'\x2A']
-    TERMINATORS = [b'\x04', b'*', b'\x3F']
+    TERMINATORS = [b'\x04', b'\x2A', b'\x3F']
 
     def __init__(self, sheet: dict[str] = None):
         self.sheet = sheet or {"tokens": {}, "meta": []}
@@ -111,8 +111,8 @@ def build_page(element: ET.Element, byte: str, dct: dict):
                 for name in dct.get("variants", set()):
                     element.append(ET.Element("Alt", string=name))
 
-            for child in dict(sorted(dct.get("tokens", {}).items())):
-                build_page(element, child, dct["tokens"][child])
+            for child_byte, child_dct in sorted(dct.get("tokens", {}).items()):
+                build_page(element, child_byte, child_dct)
 
         build_page(sheet, "", self.sheet)
         return sheet

From abc51438dcc0e0e743e6402586680582fbe771c0 Mon Sep 17 00:00:00 2001
From: KG <kgscience@hotmail.com>
Date: Wed, 20 Dec 2023 19:36:52 -0500
Subject: [PATCH 3/5] Incorporate display names where possible

---
 scripts/tokenide.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/scripts/tokenide.py b/scripts/tokenide.py
index 270ba54..857e565 100644
--- a/scripts/tokenide.py
+++ b/scripts/tokenide.py
@@ -142,7 +142,13 @@ def with_tokens(self, *,
             else:
                 tokens = Tokens.from_xml_string(file.read(), version or OsVersions.LATEST)
 
-        for byte, token in tokens.bytes.items():
+        all_bytes = tokens.bytes
+
+        display_names = [token.langs.get(lang, "en").display for token in all_bytes.values()]
+        names = [name for token in all_bytes.values() for name in token.langs.get(lang, "en").names()] + display_names
+        safe_display_names = {name for name in display_names if names.count(name) == 1}
+
+        for byte, token in all_bytes.items():
             if version is not None and token.since > version:
                 continue
 
@@ -162,8 +168,11 @@ def with_tokens(self, *,
                     dct[value] = {"string": None, "variants": set(), "attrib": {}, "tokens": {}}
 
             translation = token.langs.get(lang, "en")
-            dct[value]["string"] = dct[value]["string"] or translation.accessible
-            dct[value]["variants"] |= {name for name in translation.names() if name != dct[value]["string"]}
+            dct[value]["string"] = string = dct[value]["string"] or translation.accessible
+            dct[value]["variants"] |= {name for name in translation.names() if name != string}
+
+            if string not in translation.display and translation.display in safe_display_names:
+                dct[value]["variants"].add(translation.display)
 
             if byte in TokenIDESheet.STARTERS:
                 dct[value]["attrib"]["stringStarter"] = "true"

From 310a6301bec1a9ba86e02318d76d3ca04dce5d27 Mon Sep 17 00:00:00 2001
From: KG <kgscience@hotmail.com>
Date: Sat, 23 Dec 2023 14:58:44 -0500
Subject: [PATCH 4/5] Add all redundancy checks

---
 scripts/tokenide.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/scripts/tokenide.py b/scripts/tokenide.py
index 857e565..3292b99 100644
--- a/scripts/tokenide.py
+++ b/scripts/tokenide.py
@@ -144,9 +144,8 @@ def with_tokens(self, *,
 
         all_bytes = tokens.bytes
 
-        display_names = [token.langs.get(lang, "en").display for token in all_bytes.values()]
-        names = [name for token in all_bytes.values() for name in token.langs.get(lang, "en").names()] + display_names
-        safe_display_names = {name for name in display_names if names.count(name) == 1}
+        all_names = [name for token in all_bytes.values()
+                     for name in [*token.langs.get(lang, "en").names(), token.langs.get(lang, "en").display]]
 
         for byte, token in all_bytes.items():
             if version is not None and token.since > version:
@@ -168,11 +167,18 @@ def with_tokens(self, *,
                     dct[value] = {"string": None, "variants": set(), "attrib": {}, "tokens": {}}
 
             translation = token.langs.get(lang, "en")
-            dct[value]["string"] = string = dct[value]["string"] or translation.accessible
-            dct[value]["variants"] |= {name for name in translation.names() if name != string}
+            display = translation.display
 
-            if string not in translation.display and translation.display in safe_display_names:
-                dct[value]["variants"].add(translation.display)
+            if dct[value]["string"] not in [*translation.names(), display]:
+                dct[value]["string"] = translation.accessible
+
+            dct[value]["variants"] |= {name for name in translation.names() if all_names.count(name) == 1}
+
+            string = dct[value]["string"]
+            if string not in display and display not in string and all_names.count(display) == 1:
+                dct[value]["variants"].add(display)
+
+            dct[value]["variants"] -= {string}
 
             if byte in TokenIDESheet.STARTERS:
                 dct[value]["attrib"]["stringStarter"] = "true"

From b842d95aa069159eff07a4990b024088d321067f Mon Sep 17 00:00:00 2001
From: KG <kgscience@hotmail.com>
Date: Sat, 23 Dec 2023 15:01:56 -0500
Subject: [PATCH 5/5] Update header comment

---
 scripts/tokenide.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/tokenide.py b/scripts/tokenide.py
index 3292b99..99f85c8 100644
--- a/scripts/tokenide.py
+++ b/scripts/tokenide.py
@@ -10,7 +10,6 @@
 
 TokenIDE created by
 Shaun McFall, Merthsoft Creations
-shaunm.mcfall@gmail.com
 -->"""