Skip to content

Commit

Permalink
Merge pull request #62 from microsoft/dev/t-schn/v0.3rc2
Browse files Browse the repository at this point in the history
- added first draft of markdown express parser
  • Loading branch information
t-schn authored Nov 1, 2024
2 parents ad803a7 + 7bcc665 commit e9fddc3
Show file tree
Hide file tree
Showing 3 changed files with 262 additions and 1 deletion.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sammo"
version = "0.3.0.rc1"
version = "0.3.0.rc2"
description = "A flexible, easy-to-use library for running and optimizing prompts for Large Language Models (LLMs)."
authors = ["Tobias Schnabel"]
license = "MIT"
Expand Down Expand Up @@ -34,6 +34,8 @@ dill = "^0.3"
quattro = "^24"
async-timeout = "^4.0.3"
lxml = "^5.3"
cssselect = "^1.2"
mistletoe = "^1.4"

[tool.poetry.extras]
parser = ["benepar"]
Expand Down
132 changes: 132 additions & 0 deletions sammo/express.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import json
import re
from collections import namedtuple
from mistletoe.block_token import List
from mistletoe.markdown_renderer import MarkdownRenderer

from mistletoe import Document, block_token
from sammo.instructions import MetaPrompt, Section, Paragraph

# Matches an HTML comment and captures the text between the markers.
# No DOTALL flag is set, so "." does not cross newlines.
HTML_COMMENT = re.compile(r"<!--(.*?)-->")
# Matches "#name" (word characters only) followed by whitespace or end of
# string. Two groups, so findall() yields (name, terminator) tuples.
HTML_IDS = re.compile(r"#(\w+)($|\s)")
# Matches ".name" (word characters and hyphens) followed by whitespace or end
# of string. Two groups, so findall() yields (name, terminator) tuples.
HTML_CLASSES = re.compile(r"\.([\w-]+)($|\s)")


def _extract_html_comment(text):
    """Separate the HTML comment payload from the rest of ``text``.

    Returns a ``(inner_comment, rest)`` tuple. ``inner_comment`` is the text
    inside the first match of ``HTML_COMMENT`` (an empty string when nothing
    matches). ``rest`` is ``text`` with every match of ``HTML_COMMENT``
    removed, so the content of any later comment is dropped, not returned.
    """
    first_match = HTML_COMMENT.search(text)
    if first_match is None:
        return "", text
    return first_match.group(1), HTML_COMMENT.sub("", text)


def _get_ids_and_classes(text):
    """Read ``#id`` and ``.class`` annotations from the HTML comment in ``text``.

    Returns a dict with three keys: ``"text"`` (``text`` with comments
    removed), ``"ids"`` and ``"classes"`` (lists of names, in the order they
    appear in the comment).
    """
    annotation, remainder = _extract_html_comment(text)
    # Both patterns have two groups, so findall() yields tuples; the name is
    # always the first element.
    id_names = [name for name, _ in HTML_IDS.findall(annotation)]
    class_names = [name for name, _ in HTML_CLASSES.findall(annotation)]
    return {"text": remainder, "ids": id_names, "classes": class_names}


class ExpressParser:
    """Turn annotated markdown into a prompt tree plus an optional config.

    Headings become ``Section`` objects nested according to heading level.
    Lists, paragraphs, code blocks/fences and quotes become ``Paragraph``
    objects. A code fence whose language is ``{sammo/mutators}`` is parsed as
    JSON and stored as the config instead of being added to the tree. Ids
    (``#name``) and classes (``.name``) are read from HTML comments on
    headings and list items.
    """

    def __init__(self, input_text: str):
        """Parse ``input_text``.

        Sets ``parsed_config`` to the decoded ``{sammo/mutators}`` fence (an
        empty dict when there is none) and ``parsed_tree`` to the converted
        prompt tree.
        """
        aux_tree, config = self._parse_annotated_markdown(input_text)
        self.parsed_config = config
        self.parsed_tree = self._aux_tree_to_sammo(aux_tree)

    @staticmethod
    def from_file(file_path):
        """Read ``file_path`` as UTF-8 and return an ``ExpressParser`` for its content."""
        with open(file_path, "r", encoding="utf-8") as file:
            return ExpressParser(file.read())

    @staticmethod
    def _parse_annotated_markdown(text):
        """Build an intermediate dict tree from markdown ``text``.

        Returns a ``(tree, config)`` tuple. ``tree`` is
        ``{"type": "root", "children": [...]}`` and ``config`` is the JSON
        content of the last ``{sammo/mutators}`` code fence, or an empty dict.
        """
        doc = Document(text)
        sammo_config = dict()
        # current: child list new content is appended to;
        # parent: child list that holds the section owning `current`;
        # level: heading level of that section (0 for the document root).
        State = namedtuple("State", ["current", "parent", "level"])
        with MarkdownRenderer() as mrender:
            processed = list()
            stack = [State(processed, processed, 0)]
            for element in doc.children:
                last = stack[-1]
                if isinstance(element, List):
                    list_elements = list()
                    # Dicts are used as insertion-ordered sets so the
                    # resulting id/class lists are de-duplicated but keep a
                    # stable order from run to run.
                    classes = dict()
                    ids = dict()

                    for item in element.children:
                        parsed = _get_ids_and_classes(mrender.render(item))
                        classes.update(dict.fromkeys(parsed["classes"]))
                        ids.update(dict.fromkeys(parsed["ids"]))
                        list_elements.append([parsed["text"]])

                    last.current.append(
                        {"type": "list", "children": list_elements, "class": list(classes), "id": list(ids)}
                    )
                elif isinstance(element, block_token.Heading):
                    parsed = _get_ids_and_classes(mrender.render(element))
                    new = {
                        "type": "section",
                        "title": parsed["text"],
                        "children": list(),
                        "id": parsed["ids"],
                        "class": parsed["classes"],
                    }
                    if element.level < last.level:
                        # Shallower heading: close every section at the same
                        # or a deeper level. The root entry has level 0 and
                        # is therefore never popped.
                        while stack[-1].level >= element.level:
                            stack.pop()
                        scope = stack[-1].current
                    elif element.level == last.level:
                        # Sibling of the most recent section.
                        scope = last.parent
                    else:
                        # Deeper heading: nest inside the most recent section.
                        scope = last.current
                    stack.append(State(new["children"], scope, element.level))
                    scope.append(new)
                elif isinstance(element, block_token.CodeFence) and element.language.lower() == "{sammo/mutators}":
                    sammo_config = json.loads(element.children[0].content)
                else:
                    last.current.append(
                        {"type": element.__class__.__name__.lower(), "children": [mrender.render(element)]}
                    )
        return {"type": "root", "children": processed}, sammo_config

    @classmethod
    def _aux_tree_to_sammo(cls, node):
        """Convert an intermediate tree node into prompt objects.

        Non-dict nodes are returned unchanged. Dict nodes are dispatched on
        ``node["type"]``.

        Raises:
            ValueError: if a node carries more than one id, if its ``"id"``
                entry is not a list, or if its type is not supported.
        """

        def _empty_to_none(x):
            return None if len(x) == 0 else x

        def _unwrap_list(x):
            # A node may carry at most one id; anything else is reported
            # instead of being passed on silently.
            if not isinstance(x, list):
                raise ValueError(f"Expected list of length 0 or 1, got {type(x).__name__}")
            if len(x) > 1:
                raise ValueError(f"Expected list of length 0 or 1, got {len(x)}")
            return x[0] if x else x

        def _get_annotations(x):
            return dict(
                reference_id=_empty_to_none(_unwrap_list(x.get("id", []))),
                reference_classes=_empty_to_none(x.get("class", [])),
            )

        if not isinstance(node, dict):
            return node

        node_type = node["type"]
        if node_type == "root":
            return MetaPrompt([cls._aux_tree_to_sammo(child) for child in node["children"]])
        if node_type == "section":
            # Validate annotations first so a malformed node fails before any
            # children are converted.
            annotations = _get_annotations(node)
            return Section(
                title=node["title"],
                content=[cls._aux_tree_to_sammo(child) for child in node["children"]],
                **annotations,
            )
        if node_type in ("paragraph", "list", "blockcode", "codefence", "quote"):
            annotations = _get_annotations(node)
            return Paragraph(content=node["children"], **annotations)
        raise ValueError(f"Unsupported type: {type(node)} with node: {node}")
127 changes: 127 additions & 0 deletions sammo/express_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import textwrap
from sammo.express import ExpressParser, _extract_html_comment, _get_ids_and_classes

# Sample document combining nested headings, an annotated list (ids and
# classes in an HTML comment), a {sammo/mutators} JSON code fence and a link.
# NOTE(review): not referenced by any test visible in this chunk -- confirm
# whether it is still needed.
COMPLEX = """
# Heading 1 <!-- .header -->
A long trip.
* list **item** 1 <!-- #id1 #id2 .class1 .class2 -->
* list item 2
## Heading 1.2
A short trip
## Heading 1.3
# Heading 2
Another long trip.
```{sammo/mutators}
{
"mutators": [
{
"name": "mutator1",
"type": "type1"
},
{
"name": "mutator2",
"type": "type2"
}
]
}
```
## Heading 2.1
And so **on**.
# Heading 3
[ref](https://www.google.com)
"""


def test_extract_html_comment():
    """A comment embedded in text is returned separately from the remaining text."""
    inner, remainder = _extract_html_comment("Some text <!-- This is a comment -->more text")
    assert (inner, remainder) == (" This is a comment ", "Some text more text")


def test_extract_html_comment_no_comment():
    """Text without a comment yields an empty comment and the text unchanged."""
    source = "Some text more text"
    inner, remainder = _extract_html_comment(source)
    assert (inner, remainder) == ("", source)


def test_get_ids_and_classes():
    """Ids and classes inside the comment are collected; the comment is stripped."""
    parsed = _get_ids_and_classes("Some text <!-- #id1 .class1 #id2 .class2 --> more text")
    assert parsed["text"] == "Some text more text"
    assert {"id1", "id2"} == set(parsed["ids"])
    assert {"class1", "class2"} == set(parsed["classes"])


def test_get_ids_and_classes_no_comment():
    """Without a comment the text is unchanged and no ids or classes are found."""
    source = "Some text more text"
    parsed = _get_ids_and_classes(source)
    assert parsed["text"] == source
    assert parsed["ids"] == []
    assert parsed["classes"] == []


def test_express_parser_parse_annotated_markdown():
    """A document without a mutators fence parses to a tree and an empty config."""
    markdown = """
# Heading 1
Some content
* list item 1 <!-- #id1 .class1 -->
* list item 2
"""
    result = ExpressParser(textwrap.dedent(markdown))
    assert result.parsed_tree is not None
    assert result.parsed_config == {}


def test_express_parser_aux_tree_to_sammo():
    """Converting an already-parsed tree again still yields a result."""
    markdown = """
# Heading 1
Some content
```{python}
print("Hello, World!")
```
"""
    result = ExpressParser(textwrap.dedent(markdown))
    converted = result._aux_tree_to_sammo(result.parsed_tree)
    assert converted is not None


def test_express_parser_with_mutators():
    """The JSON inside a {sammo/mutators} fence becomes the parsed config."""
    markdown = """
# Heading 1
Some content
> Somewhere, something incredible is waiting to be known
```{sammo/mutators}
{
"mutators": [
{
"name": "mutator1",
"type": "type1"
},
{
"name": "mutator2",
"type": "type2"
}
]
}
```
"""
    expected_config = {
        "mutators": [{"name": "mutator1", "type": "type1"}, {"name": "mutator2", "type": "type2"}]
    }
    result = ExpressParser(textwrap.dedent(markdown))
    assert result.parsed_config == expected_config

0 comments on commit e9fddc3

Please sign in to comment.