jobs:
  build:
    name: Build and validate token sheets
    runs-on: ubuntu-latest

    # Do not run for draft pull requests
    if: github.event.pull_request.draft == false
    steps:
      - name: Checkout sheets
        uses: actions/checkout@v4

      - name: Run build script
        run: |
          mkdir built
          python -m scripts.build

      # Hand the validated sheets to the commit job
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: built
          path: built/

  commit:
    name: Push sheets to built branch
    runs-on: ubuntu-latest

    permissions: write-all
    needs: build

    # Only publish for pushes to main, never for pull requests
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
    steps:
      # The globs file referenced by CLEAR_GLOBS_FILE lives in the repository,
      # so the repository must be checked out for it to exist on the runner
      - name: Checkout sheets
        uses: actions/checkout@v4

      - name: Download artifact
        uses: actions/download-artifact@v4
        with:
          name: built
          path: built/

      - name: Save to built branch
        uses: s0/git-publish-subdir-action@develop
        env:
          REPO: self
          BRANCH: built
          FOLDER: built
          SKIP_EMPTY_COMMITS: true
          # Must match the location where the globs file is committed
          CLEAR_GLOBS_FILE: ".github/workflows/.clear-in-built"
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MESSAGE: "Build {sha}: {msg}"
"""Build script: validate the token sheets and copy them into ``built/``."""

import json
import xml.etree.ElementTree as ET
from pathlib import Path

from .formats import to_json, validate

# Directory that receives the validated sheets
BUILT = Path("built")


def _build_sheet(name: str, *, for_73: bool = False, emit_json: bool = False) -> None:
    """
    Validates ``<name>.xml`` and writes it (and optionally its JSON form) to ``built/``

    :param name: The sheet's file name without extension, e.g. ``"8X"``
    :param for_73: Whether to validate with the 73 sheet rules
    :param emit_json: Whether to also write ``built/<name>.json``
    :raises ValueError: If the sheet fails validation; nothing is written in that case
    """

    src = Path(f"{name}.xml").read_text(encoding="UTF-8")
    root = ET.fromstring(src)

    # Validate before any output file is opened so that a failing sheet
    # never leaves an empty or partial file behind
    validate(root, for_73=for_73)

    BUILT.mkdir(parents=True, exist_ok=True)
    (BUILT / f"{name}.xml").write_text(src, encoding="UTF-8")

    if emit_json:
        with open(BUILT / f"{name}.json", "w", encoding="UTF-8") as outfile:
            json.dump(to_json(root), outfile, indent=2, ensure_ascii=False)


# Runs at module level (as before), so both ``python -m scripts.build`` and a
# plain import of this module perform the build
_build_sheet("8X", emit_json=True)
_build_sheet("73", for_73=True)
element.attrib.get("code", "") + + class ValidationError(ValueError): + __qualname__ = "ValidationError" + + def __init__(self, message: str): + super().__init__((f"token 0x{byte}: " if byte else "root: ") + message) + + # Require attributes matching regexes + def attributes(attrs: dict[str, str]): + attrib = element.attrib.copy() + + for attr, regex in attrs.items(): + if attr not in attrib: + raise ValidationError(f"<{element.tag}> does not have attribute {attr}") + + if not re.fullmatch(regex, value := attrib.pop(attr)): + raise ValidationError(f"<{element.tag}> {attr} '{value}' does not match r'{regex}'") + + if attrib: + raise ValidationError(f"<{element.tag}> has unexpected attribute {[*attrib.values()][0]}") + + # Require child tags to match regex when appended in order + def children(regex: str): + if not re.fullmatch(regex, "".join(f"<{child.tag}>" for child in element)): + raise ValidationError(f"children of <{element.tag}> do not match r'{regex}'") + + # Require text to match regex + def text(regex: str): + if not re.fullmatch(regex, element.text): + raise ValidationError(f"<{element.tag}> text '{element.text}' does not match r'{regex}'") + + # Check requirements for each tag + match element.tag: + case "tokens": + children(r"(|)+") + + case "two-byte": + attributes({"value": r"\$[0-9A-F]{2}"}) + children(r"()+") + + case "token": + attributes({"value": r"\$[0-9A-F]{2}"}) + children(r"()+") + + if byte in all_tokens: + raise ValidationError("token byte must be unique") + + all_tokens.add(byte) + + case "version": + version = OsVersions.INITIAL + children(r"()?()+") + + case "since": + if not for_73: + if (this_version := OsVersion.from_element(element)) < version: + raise ValidationError(f"version {this_version} overlaps with {version}") + + version = this_version + + # Workaround for nested defaultdict + all_names[version] = all_names.get(version, defaultdict(set)) + + children(r"") + + case "until": + children(r"") + + case "lang": + 
attributes({"code": r"[a-z]{2}"} if for_73 else {"code": r"[a-z]{2}", "ti-ascii": r"([0-9A-F]{2})+"}) + children(r"" if for_73 else r"()*") + + case "name" if for_73: + text(r"[\S\s]+") + + case "display": + text(r"[\S\s]+") + + case "accessible": + text(r"[\u0000-\u00FF]*") + + if element.text in all_names[version][lang]: + raise ValidationError(f"{lang} accessible name '{element.text}' is not unique within {version}") + + all_names[version][lang].add(element.text) + + case "variant": + text(r".+") + + if element.text in all_names[version][lang]: + raise ValidationError(f"{lang} variant name '{element.text}' is not unique within {version}") + + all_names[version][lang].add(element.text) + + case "model": + text(r"TI-\d\d.*") + + case "os-version": + text(r"(\d+\.)+\d+") + + case _: + raise ValidationError(f"unrecognized tag <{element.tag}>") + + # Visit children + for child in element: + visit(child, byte, lang) + + visit(root) + return len(all_tokens) + + +def to_json(element: ET.Element): + """ + Converts a token sheet to an equivalent JSON representation + + :param element: An XML element; call on the root element to convert the entire sheet + :return: The element and all its descendants as JSON + """ + + match element.tag: + case "tokens" | "two-byte": + return {child.attrib["value"]: to_json(child) for child in element} + + case "token": + return [to_json(child) for child in element] + + case "version": + dct = {} + langs = {} + + for child in element: + if child.tag == "lang": + langs[child.attrib["code"]] = to_json(child) + + else: + dct[child.tag] = to_json(child) + + return dct | {"langs": langs} + + case "lang": + dct = {"ti-ascii": element.attrib["ti-ascii"]} + variants = [] + + for child in element: + if child.tag == "variant": + variants.append(child.text) + + else: + dct[child.tag] = child.text + + if variants: + return dct | {"variants": variants} + + else: + return dct + + case _: + if list(element): + return {child.tag: to_json(child) for child in 
element} + + else: + return element.text + + +# with open("../8X.xml", encoding="UTF-8") as file: +# json.dumps(to_json(ET.fromstring(file.read())), indent=2) + + +__all__ = ["to_json", "validate"] diff --git a/scripts/parse.py b/scripts/parse.py index 425d048..398979a 100644 --- a/scripts/parse.py +++ b/scripts/parse.py @@ -42,7 +42,7 @@ @functools.total_ordering -@dataclass +@dataclass(frozen=True) class OsVersion: """ Data class for defining and comparing OS versions