Merge pull request #62 from microsoft/dev/t-schn/v0.3rc2

- added first draft of markdown express parser
microsoft · Nov 1, 2024 · e9fddc3 · e9fddc3
2 parents ad803a7 + 7bcc665
commit e9fddc3
Show file tree

Hide file tree

Showing 3 changed files with 262 additions and 1 deletion.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sammo"
-version = "0.3.0.rc1"
+version = "0.3.0.rc2"
 description = "A flexible, easy-to-use library for running and optimizing prompts for Large Language Models (LLMs)."
 authors = ["Tobias Schnabel"]
 license = "MIT"
@@ -34,6 +34,8 @@ dill = "^0.3"
 quattro = "^24"
 async-timeout = "^4.0.3"
 lxml = "^5.3"
+cssselect = "^1.2"
+mistletoe = "^1.4"
 
 [tool.poetry.extras]
 parser = ["benepar"]

diff --git a/sammo/express.py b/sammo/express.py
@@ -0,0 +1,132 @@
+import json
+import re
+from collections import namedtuple
+from mistletoe.block_token import List
+from mistletoe.markdown_renderer import MarkdownRenderer
+
+from mistletoe import Document, block_token
+from sammo.instructions import MetaPrompt, Section, Paragraph
+
+# Matches an HTML comment and captures its inner text (non-greedy). "." does not
+# match newlines here, so a comment must open and close on the same line.
+HTML_COMMENT = re.compile(r"<!--(.*?)-->")
+# Matches "#name" tokens (word characters only) followed by whitespace or end of input.
+HTML_IDS = re.compile(r"#(\w+)($|\s)")
+# Matches ".name" tokens (word characters and hyphens) followed by whitespace or end of input.
+HTML_CLASSES = re.compile(r"\.([\w-]+)($|\s)")
+
+
+def _extract_html_comment(text):
+    """Split ``text`` into the body of its first HTML comment and the remainder.
+
+    Returns a ``(comment, rest)`` tuple. ``comment`` is the text between
+    ``<!--`` and ``-->`` of the first match, or an empty string when there is
+    no comment. ``rest`` is ``text`` with every matching comment removed.
+    """
+    found = HTML_COMMENT.search(text)
+    if found is None:
+        return "", text
+    return found.group(1), HTML_COMMENT.sub("", text)
+
+
+def _get_ids_and_classes(text):
+    """Read ``#id`` and ``.class`` annotations from the HTML comment in ``text``.
+
+    Returns a dict with three keys: ``"text"`` (the input with HTML comments
+    stripped), ``"ids"`` and ``"classes"`` (lists of names without their
+    ``#`` / ``.`` prefix, in order of appearance; empty when none are found).
+    """
+    annotation, remainder = _extract_html_comment(text)
+    # Both patterns define two groups, so findall yields (name, terminator) pairs.
+    id_names = [name for name, _ in HTML_IDS.findall(annotation)]
+    class_names = [name for name, _ in HTML_CLASSES.findall(annotation)]
+    return {"text": remainder, "ids": id_names, "classes": class_names}
+
+
+class ExpressParser:
+    """Parse annotated markdown into a SAMMO prompt tree plus a config dict.
+
+    Headings become nested ``Section`` objects; lists and other block elements
+    become ``Paragraph`` objects. Ids and classes are read from HTML comments
+    such as ``<!-- #intro .important -->``. A code fence whose language is
+    ``{sammo/mutators}`` is parsed as JSON and exposed as the config instead of
+    being added to the tree.
+
+    Attributes:
+        parsed_config: Dict loaded from the ``{sammo/mutators}`` code fence, or
+            an empty dict when the input has no such fence.
+        parsed_tree: ``MetaPrompt`` built from the markdown structure.
+    """
+
+    def __init__(self, input_text: str):
+        """Parse ``input_text`` and store the resulting tree and config."""
+        aux_tree, config = self._parse_annotated_markdown(input_text)
+        self.parsed_config = config
+        self.parsed_tree = self._aux_tree_to_sammo(aux_tree)
+
+    @staticmethod
+    def from_file(file_path):
+        """Build a parser from the UTF-8 encoded markdown file at ``file_path``."""
+        with open(file_path, "r", encoding="utf-8") as file:
+            return ExpressParser(file.read())
+
+    @staticmethod
+    def _parse_annotated_markdown(text):
+        """Turn markdown ``text`` into an auxiliary dict tree and a config dict.
+
+        Returns:
+            A ``(tree, config)`` tuple. ``tree`` is ``{"type": "root",
+            "children": [...]}`` where headings are ``"section"`` nodes that
+            contain the elements following them, lists are ``"list"`` nodes,
+            and any other block is a node typed after its lowercased mistletoe
+            class name. ``config`` is the JSON content of the last
+            ``{sammo/mutators}`` code fence, or an empty dict.
+
+        Raises:
+            json.JSONDecodeError: If a ``{sammo/mutators}`` fence does not
+                contain valid JSON.
+        """
+        doc = Document(text)
+        sammo_config = {}
+        # current: child list that receives content following the heading
+        # parent:  list the heading's own section node was appended to
+        # level:   heading level (0 for the document root)
+        State = namedtuple("State", ["current", "parent", "level"])
+        with MarkdownRenderer() as mrender:
+            processed = []
+            stack = [State(processed, processed, 0)]
+            for element in doc.children:
+                last = stack[-1]
+                if isinstance(element, List):
+                    list_elements = []
+                    classes = set()
+                    ids = set()
+
+                    # Annotations found on any item are merged and attached to the list as a whole.
+                    for c in element.children:
+                        d = _get_ids_and_classes(mrender.render(c))
+                        classes.update(d["classes"])
+                        ids.update(d["ids"])
+                        list_elements.append([d["text"]])
+
+                    last.current.append(
+                        {"type": "list", "children": list_elements, "class": list(classes), "id": list(ids)}
+                    )
+                elif isinstance(element, block_token.Heading):
+                    d = _get_ids_and_classes(mrender.render(element))
+                    new = {
+                        "type": "section",
+                        "title": d["text"],
+                        "children": [],
+                        "id": d["ids"],
+                        "class": d["classes"],
+                    }
+                    if element.level < last.level:
+                        # Shallower heading: unwind to the nearest enclosing section of lower level.
+                        while stack[-1].level >= element.level:
+                            stack.pop()
+                        scope = stack[-1].current
+                    elif element.level == last.level:
+                        # Sibling heading: attach next to the previous section. The stale
+                        # stack entry is discarded by the unwinding loop above later on.
+                        scope = last.parent
+                    else:
+                        # Deeper heading: nest inside the current section.
+                        scope = last.current
+                    stack.append(State(new["children"], scope, element.level))
+                    scope.append(new)
+                elif isinstance(element, block_token.CodeFence) and element.language.lower() == "{sammo/mutators}":
+                    # Configuration block: parsed as JSON and kept out of the prompt tree.
+                    sammo_config = json.loads(element.children[0].content)
+                else:
+                    last.current.append(
+                        {"type": element.__class__.__name__.lower(), "children": [mrender.render(element)]}
+                    )
+        return {"type": "root", "children": processed}, sammo_config
+
+    @classmethod
+    def _aux_tree_to_sammo(cls, node):
+        """Recursively convert an auxiliary tree node into SAMMO components.
+
+        Args:
+            node: A dict node as produced by ``_parse_annotated_markdown``, or
+                any non-dict leaf value.
+
+        Returns:
+            ``MetaPrompt`` for the root node, ``Section`` for section nodes,
+            ``Paragraph`` for paragraph/list/code/quote nodes, and the value
+            itself for non-dict leaves.
+
+        Raises:
+            ValueError: If a node carries more than one id, or if its type is
+                not supported.
+        """
+
+        def _empty_to_none(x):
+            return None if len(x) == 0 else x
+
+        def _unwrap_list(x):
+            # A node may carry at most one id. Raise on violations rather than
+            # handing an exception object on as if it were the id.
+            if not isinstance(x, list):
+                raise ValueError(f"Expected list of length 0 or 1, got {type(x).__name__}")
+            if len(x) > 1:
+                raise ValueError(f"Expected list of length 0 or 1, got {len(x)}")
+            return x[0] if x else x
+
+        def _get_annotations(x):
+            return dict(
+                reference_id=_empty_to_none(_unwrap_list(x.get("id", []))),
+                reference_classes=_empty_to_none(x.get("class", [])),
+            )
+
+        if not isinstance(node, dict):
+            # Leaves (rendered markdown strings, nested lists, ...) pass through unchanged.
+            return node
+
+        node_type = node["type"]
+        if node_type == "root":
+            return MetaPrompt([cls._aux_tree_to_sammo(child) for child in node["children"]])
+        if node_type == "section":
+            return Section(
+                title=node["title"],
+                content=[cls._aux_tree_to_sammo(child) for child in node["children"]],
+                **_get_annotations(node),
+            )
+        if node_type in ["paragraph", "list", "blockcode", "codefence", "quote"]:
+            return Paragraph(content=node["children"], **_get_annotations(node))
+        raise ValueError(f"Unsupported type: {type(node)} with node: {node}")
diff --git a/sammo/express_test.py b/sammo/express_test.py
@@ -0,0 +1,127 @@
+import textwrap
+from sammo.express import ExpressParser, _extract_html_comment, _get_ids_and_classes
+
+# Sample annotated-markdown document: nested headings, a list whose first item
+# carries ids and classes in an HTML comment, and a `{sammo/mutators}` JSON code
+# fence. NOTE(review): not referenced by the tests visible in this file chunk.
+COMPLEX = """
+# Heading 1  <!-- .header -->
+A long trip.
+
+* list **item** 1 <!-- #id1 #id2 .class1 .class2 -->
+* list item 2
+
+## Heading 1.2
+A short trip
+
+## Heading 1.3
+
+# Heading 2
+Another long trip.
+
+```{sammo/mutators}
+{
+  "mutators": [
+    {
+      "name": "mutator1",
+      "type": "type1"
+    },
+    {
+      "name": "mutator2",
+      "type": "type2"
+    }
+  ]
+}
+```
+
+## Heading 2.1
+And so **on**.
+
+# Heading 3
+[ref](https://www.google.com)
+"""
+
+
+def test_extract_html_comment():
+    """An inline comment is returned verbatim and removed from the text."""
+    inner, remainder = _extract_html_comment("Some text <!-- This is a comment -->more text")
+    assert (inner, remainder) == (" This is a comment ", "Some text more text")
+
+
+def test_extract_html_comment_no_comment():
+    """Text without a comment comes back unchanged, with an empty comment."""
+    plain = "Some text more text"
+    inner, remainder = _extract_html_comment(plain)
+    assert (inner, remainder) == ("", plain)
+
+
+def test_get_ids_and_classes():
+    """Ids and classes are collected from the comment, which is stripped from the text."""
+    parsed = _get_ids_and_classes("Some text <!-- #id1 .class1 #id2 .class2 --> more text")
+    assert parsed["text"] == "Some text  more text"
+    assert {"id1", "id2"} == set(parsed["ids"])
+    assert {"class1", "class2"} == set(parsed["classes"])
+
+
+def test_get_ids_and_classes_no_comment():
+    """Without a comment the text is untouched and no annotations are reported."""
+    plain = "Some text more text"
+    parsed = _get_ids_and_classes(plain)
+    assert parsed["text"] == plain
+    assert parsed["ids"] == [] and parsed["classes"] == []
+
+
+def test_express_parser_parse_annotated_markdown():
+    """A document without a mutators fence yields a tree and an empty config."""
+    markdown = textwrap.dedent(
+        """
+    # Heading 1
+    Some content
+    * list item 1 <!-- #id1 .class1 -->
+    * list item 2
+    """
+    )
+    result = ExpressParser(markdown)
+    assert result.parsed_config == {}
+    assert result.parsed_tree is not None
+
+
+def test_express_parser_aux_tree_to_sammo():
+    """The auxiliary dict tree of a small document converts into a SAMMO tree.
+
+    The conversion is fed a freshly parsed auxiliary tree. Passing in an
+    already-converted tree would only hit the non-dict passthrough and verify
+    nothing.
+    """
+    input_text = textwrap.dedent(
+        """
+    # Heading 1
+    Some content
+    ```{python}
+    print("Hello, World!")
+    ```
+    """
+    )
+    aux_tree, config = ExpressParser._parse_annotated_markdown(input_text)
+    assert aux_tree["type"] == "root"
+    assert config == {}
+    sammo_tree = ExpressParser._aux_tree_to_sammo(aux_tree)
+    assert sammo_tree is not None
+    # The result must be a converted object, not the auxiliary dict handed back.
+    assert not isinstance(sammo_tree, dict)
+
+
+def test_express_parser_with_mutators():
+    """The JSON inside a ``{sammo/mutators}`` fence is exposed as ``parsed_config``."""
+    expected_config = {
+        "mutators": [{"name": "mutator1", "type": "type1"}, {"name": "mutator2", "type": "type2"}]
+    }
+    markdown = textwrap.dedent(
+        """
+    # Heading 1
+    Some content
+    > Somewhere, something incredible is waiting to be known
+
+    ```{sammo/mutators}
+    {
+      "mutators": [
+        {
+          "name": "mutator1",
+          "type": "type1"
+        },
+        {
+          "name": "mutator2",
+          "type": "type2"
+        }
+      ]
+    }
+    ```
+    """
+    )
+    assert ExpressParser(markdown).parsed_config == expected_config