From 95a70b1659e4d82f65c0c05b027d3a60771b594c Mon Sep 17 00:00:00 2001 From: KG Date: Wed, 20 Dec 2023 15:22:14 -0500 Subject: [PATCH 1/5] Add TokenIDE file generator --- scripts/__init__.py | 3 +- scripts/tokenide.py | 178 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 scripts/tokenide.py diff --git a/scripts/__init__.py b/scripts/__init__.py index be7f51a..ab4d28c 100644 --- a/scripts/__init__.py +++ b/scripts/__init__.py @@ -1,4 +1,5 @@ from .parse import Token, Tokens, OsVersion, OsVersions, Translation +from .tokenide import TokenIDESheet from .trie import TokenTrie -__all__ = ["Token", "Tokens", "OsVersion", "OsVersions", "Translation", "TokenTrie"] +__all__ = ["Token", "Tokens", "OsVersion", "OsVersions", "Translation", "TokenIDESheet", "TokenTrie"] diff --git a/scripts/tokenide.py b/scripts/tokenide.py new file mode 100644 index 0000000..a224247 --- /dev/null +++ b/scripts/tokenide.py @@ -0,0 +1,178 @@ +import os +import xml.etree.ElementTree as ET + +from .parse import Tokens, OsVersion, OsVersions + + +COMMENT = """""" + + +class TokenIDESheet: + """ + Data class representing the contents of a TokenIDE token file + + The sheet is a dictionary with two elements: + - tokens: a recursing dictionary of tokens, indexed by byte + - meta: global metadata for TokenIDE concerning styling and grouping + + If an existing TokenIDE token file is not used as a base, no metadata is present. 
+ """ + + NAMESPACE = "http://merthsoft.com/Tokens" + + STARTERS = [b'\x2A'] + TERMINATORS = [b'\x04', b'*', b'\x3F'] + + def __init__(self, sheet: dict[str] = None): + self.sheet = sheet or {"tokens": {}, "meta": []} + + @staticmethod + def from_xml_string(xml_str: str) -> 'TokenIDESheet': + """ + Constructs an instance from an XML string + + :param xml_str: An XML string + :return: A TokenIDESheet corresponding to the string + """ + + return TokenIDESheet.from_element(ET.fromstring(xml_str)) + + @staticmethod + def from_element(root: ET.Element) -> 'TokenIDESheet': + """ + Constructs an instance from an XML element in a TokenIDE token file + + :param root: An XML element, which must be the root element of the file + :return: A TokenIDESheet corresponding to the root element + """ + + if root.tag != f"{{{TokenIDESheet.NAMESPACE}}}Tokens": + raise ValueError("Not a TokenIDE xml.") + + sheet: dict[str] = {"tokens": {}, "meta": []} + + def parse_page(element: ET.Element, dct: dict): + match element.tag.removeprefix(f"{{{TokenIDESheet.NAMESPACE}}}"): + case "Token": + attrib = element.attrib + + dct["tokens"][attrib.pop("byte")] = dct = {"string": attrib.pop("string", None), "variants": set(), + "attrib": attrib, "tokens": {}} + + case "Alt": + dct["variants"].add(element.attrib["string"]) + + case "Groups" | "Styles": + sheet["meta"].append(element) + + for child in element: + parse_page(child, dct) + + parse_page(root, sheet) + return TokenIDESheet(sheet) + + def to_xml_string(self) -> str: + """ + :return: This sheet as an indented XML string + """ + + element = self.to_element() + ET.indent(element, " ") + + # ET does not provide a method to insert a header comment, so COMMENT is spliced in after the "?>" of the XML declaration + string = ET.tostring(element, encoding="utf8").decode() + string = string.replace("utf8", "utf-8") + return string.replace("?>", "?>\n" + COMMENT) + + def to_element(self) -> ET.Element: + """ + :return: This sheet as an XML element + """ + + sheet = ET.Element(f"{{{TokenIDESheet.NAMESPACE}}}Tokens", 
+ {"xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", + "xmlns:xsd": "http://www.w3.org/2001/XMLSchema"}) + + sheet.extend(self.sheet["meta"]) + + def build_page(element: ET.Element, byte: str, dct: dict): + if byte: + element = ET.SubElement(element, "Token", byte=byte, + **({"string": dct["string"]} if dct.get("string", None) is not None else {}), + **dct.get("attrib", {})) + + for name in dct.get("variants", set()): + element.append(ET.Element("Alt", string=name)) + + for child in dict(sorted(dct.get("tokens", {}).items())): + build_page(element, child, dct["tokens"][child]) + + build_page(sheet, "", self.sheet) + return sheet + + def with_tokens(self, *, + version: OsVersion = None, tokens: Tokens = None, file=None, lang: str = 'en') -> 'TokenIDESheet': + """ + Constructs a copy of this sheet updated with the specified token data from the token sheets + + If a token is entirely absent, its accessible name is used as its string value. + Metadata is always preserved. NOTE(review): the copy is shallow (self.sheet.copy()), so nested token entries appear to be shared with, and mutated on, this sheet - confirm. + + :param version: A minimum OS version to target (defaults to latest) + :param tokens: A Tokens container of tokens to add (defaults to all tokens) + :param file: A file object to read tokens from (defaults to the 8X token sheet) + :param lang: A language code (defaults to "en") + :return: A TokenIDESheet containing the union of this sheet and the specified token data + """ + + sheet = self.sheet.copy() + + if tokens is None: + if file is None: + with open(os.path.join(os.path.dirname(__file__), "../8X.xml"), encoding="UTF-8") as file: + tokens = Tokens.from_xml_string(file.read(), version or OsVersions.LATEST) + + else: + tokens = Tokens.from_xml_string(file.read(), version or OsVersions.LATEST) + + for byte, token in tokens.bytes.items(): + if version is not None and token.since > version: + continue + + leading, trailing = byte[:1], byte[1:] + + dct = sheet["tokens"] + value = f"${leading.hex().upper()}" + + if value not in dct: + dct[value] = {"string": None, "variants": set(), "attrib": 
{}, "tokens": {}} + + if trailing: + dct = dct[value]["tokens"] + value = f"${trailing.hex().upper()}" + + if value not in dct: + dct[value] = {"string": None, "variants": set(), "attrib": {}, "tokens": {}} + + translation = token.langs.get(lang, "en") + dct[value]["string"] = dct[value]["string"] or translation.accessible + dct[value]["variants"] |= {name for name in translation.names() if name != dct[value]["string"]} + + if byte in TokenIDESheet.STARTERS: + dct[value]["attrib"]["stringStarter"] = "true" + + if byte in TokenIDESheet.TERMINATORS: + dct[value]["attrib"]["stringTerminator"] = "true" + + return TokenIDESheet(sheet) + + +ET.register_namespace("", TokenIDESheet.NAMESPACE) +__all__ = ["TokenIDESheet"] From 5a13f45f73e7944b82d74cf1d1c6b7312be1e1df Mon Sep 17 00:00:00 2001 From: KG Date: Wed, 20 Dec 2023 16:24:34 -0500 Subject: [PATCH 2/5] Improve child iterator --- scripts/tokenide.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/tokenide.py b/scripts/tokenide.py index a224247..270ba54 100644 --- a/scripts/tokenide.py +++ b/scripts/tokenide.py @@ -28,7 +28,7 @@ class TokenIDESheet: NAMESPACE = "http://merthsoft.com/Tokens" STARTERS = [b'\x2A'] - TERMINATORS = [b'\x04', b'*', b'\x3F'] + TERMINATORS = [b'\x04', b'\x2A', b'\x3F'] def __init__(self, sheet: dict[str] = None): self.sheet = sheet or {"tokens": {}, "meta": []} @@ -111,8 +111,8 @@ def build_page(element: ET.Element, byte: str, dct: dict): for name in dct.get("variants", set()): element.append(ET.Element("Alt", string=name)) - for child in dict(sorted(dct.get("tokens", {}).items())): - build_page(element, child, dct["tokens"][child]) + for child_byte, child_dct in sorted(dct.get("tokens", {}).items()): + build_page(element, child_byte, child_dct) build_page(sheet, "", self.sheet) return sheet From abc51438dcc0e0e743e6402586680582fbe771c0 Mon Sep 17 00:00:00 2001 From: KG Date: Wed, 20 Dec 2023 19:36:52 -0500 Subject: [PATCH 3/5] Incorporate display names where 
possible --- scripts/tokenide.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/tokenide.py b/scripts/tokenide.py index 270ba54..857e565 100644 --- a/scripts/tokenide.py +++ b/scripts/tokenide.py @@ -142,7 +142,13 @@ def with_tokens(self, *, else: tokens = Tokens.from_xml_string(file.read(), version or OsVersions.LATEST) - for byte, token in tokens.bytes.items(): + all_bytes = tokens.bytes + + display_names = [token.langs.get(lang, "en").display for token in all_bytes.values()] + names = [name for token in all_bytes.values() for name in token.langs.get(lang, "en").names()] + display_names + safe_display_names = {name for name in display_names if names.count(name) == 1} + + for byte, token in all_bytes.items(): if version is not None and token.since > version: continue @@ -162,8 +168,11 @@ def with_tokens(self, *, dct[value] = {"string": None, "variants": set(), "attrib": {}, "tokens": {}} translation = token.langs.get(lang, "en") - dct[value]["string"] = dct[value]["string"] or translation.accessible - dct[value]["variants"] |= {name for name in translation.names() if name != dct[value]["string"]} + dct[value]["string"] = string = dct[value]["string"] or translation.accessible + dct[value]["variants"] |= {name for name in translation.names() if name != string} + + if string not in translation.display and translation.display in safe_display_names: + dct[value]["variants"].add(translation.display) if byte in TokenIDESheet.STARTERS: dct[value]["attrib"]["stringStarter"] = "true" From 310a6301bec1a9ba86e02318d76d3ca04dce5d27 Mon Sep 17 00:00:00 2001 From: KG Date: Sat, 23 Dec 2023 14:58:44 -0500 Subject: [PATCH 4/5] Add all redundancy checks --- scripts/tokenide.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/scripts/tokenide.py b/scripts/tokenide.py index 857e565..3292b99 100644 --- a/scripts/tokenide.py +++ b/scripts/tokenide.py @@ -144,9 +144,8 @@ def with_tokens(self, *, 
all_bytes = tokens.bytes - display_names = [token.langs.get(lang, "en").display for token in all_bytes.values()] - names = [name for token in all_bytes.values() for name in token.langs.get(lang, "en").names()] + display_names - safe_display_names = {name for name in display_names if names.count(name) == 1} + all_names = [name for token in all_bytes.values() + for name in [*token.langs.get(lang, "en").names(), token.langs.get(lang, "en").display]] for byte, token in all_bytes.items(): if version is not None and token.since > version: @@ -168,11 +167,18 @@ def with_tokens(self, *, dct[value] = {"string": None, "variants": set(), "attrib": {}, "tokens": {}} translation = token.langs.get(lang, "en") - dct[value]["string"] = string = dct[value]["string"] or translation.accessible - dct[value]["variants"] |= {name for name in translation.names() if name != string} + display = translation.display - if string not in translation.display and translation.display in safe_display_names: - dct[value]["variants"].add(translation.display) + if dct[value]["string"] not in [*translation.names(), display]: + dct[value]["string"] = translation.accessible + + dct[value]["variants"] |= {name for name in translation.names() if all_names.count(name) == 1} + + string = dct[value]["string"] + if string not in display and display not in string and all_names.count(display) == 1: + dct[value]["variants"].add(display) + + dct[value]["variants"] -= {string} if byte in TokenIDESheet.STARTERS: dct[value]["attrib"]["stringStarter"] = "true" From b842d95aa069159eff07a4990b024088d321067f Mon Sep 17 00:00:00 2001 From: KG Date: Sat, 23 Dec 2023 15:01:56 -0500 Subject: [PATCH 5/5] Update header comment --- scripts/tokenide.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/tokenide.py b/scripts/tokenide.py index 3292b99..99f85c8 100644 --- a/scripts/tokenide.py +++ b/scripts/tokenide.py @@ -10,7 +10,6 @@ TokenIDE created by Shaun McFall, Merthsoft Creations -shaunm.mcfall@gmail.com -->"""