Skip to content

Commit

Permalink
[doctools refactor] Use token IDs in the new htm8.asdl
Browse files Browse the repository at this point in the history
Update benchmarks2 job to build _devbuild.gen.htm8_asdl
  • Loading branch information
Andy C committed Jan 14, 2025
1 parent 30ee4f3 commit 6f81c12
Show file tree
Hide file tree
Showing 8 changed files with 187 additions and 160 deletions.
15 changes: 15 additions & 0 deletions devtools/refactor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -292,4 +292,19 @@ singleton-primitive() {
echo
}

# Mass-rename token-ID references from the old `Tok.*` / `html.*` style to the
# new `h8_id.*` style (part of the htm8.asdl refactor).
#
# Fix over the previous version: the dot in "$prefix.$name" is now escaped.
# An unescaped `.` is the regex any-character metacharacter, so the old
# pattern would also rewrite identifiers like `TokXStartTag` or `htmlZComment`
# by accident.  `\.` matches only a literal dot.
#
# NOTE(review): the `*/*.py` glob only reaches Python files exactly one
# directory deep — confirm that covers every file that needs rewriting.
htm8() {
  for prefix in Tok html; do
    for name in \
      Decl Comment CommentBegin Processing ProcessingBegin \
      CData CDataBegin \
      StartTag StartEndTag EndTag \
      DecChar HexChar CharEntity \
      RawData HtmlCData \
      BadAmpersand BadGreaterThan BadLessThan \
      Invalid EndOfStream; do
      # Escaped dot: replace only literal "$prefix.$name" occurrences.
      sed -i "s/$prefix\.$name/h8_id.$name/g" */*.py
    done
  done
}

task-five "$@"
9 changes: 7 additions & 2 deletions doctools/cmark.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
#!/usr/bin/env python2
"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
and insert anchors.
"""Convert Markdown to HTML, with our enhancements
- Parse the HTML
- insert a TOC
- <pstrip> hack - this is obsolete with ul-table?
- Expand $xref links
- Highlight code blocks
I started from cmark-0.28.3/wrappers/wrapper.py.
"""
Expand Down
5 changes: 3 additions & 2 deletions doctools/help_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import re
import sys

from _devbuild.gen.htm8_asdl import h8_id
from doctools import html_lib
from doctools.util import log
from lazylex import html
Expand Down Expand Up @@ -309,7 +310,7 @@ def ExtractBody(s):
except StopIteration:
break

if tok_id == html.StartTag:
if tok_id == h8_id.StartTag:
tag_lexer.Reset(pos, end_pos)
if tag_lexer.TagName() == 'body':
body_start_right = end_pos # right after <body>
Expand Down Expand Up @@ -364,7 +365,7 @@ def HelpTopics(s):
except StopIteration:
break

if tok_id == html.StartTag:
if tok_id == h8_id.StartTag:
tag_lexer.Reset(pos, end_pos)
#log('%r', tag_lexer.TagString())
#log('%r', tag_lexer.TagName())
Expand Down
16 changes: 9 additions & 7 deletions doctools/oils_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import h8_id

import cgi
from typing import Iterator
from typing import Any
Expand Down Expand Up @@ -121,7 +123,7 @@ def ExpandLinks(s):
except StopIteration:
break

if tok_id == html.StartTag:
if tok_id == h8_id.StartTag:

tag_lexer.Reset(pos, end_pos)
if tag_lexer.TagName() == 'a':
Expand Down Expand Up @@ -343,7 +345,7 @@ def SimpleHighlightCode(s):
except StopIteration:
break

if tok_id == html.StartTag:
if tok_id == h8_id.StartTag:

tag_lexer.Reset(pos, end_pos)
if tag_lexer.TagName() == 'pre':
Expand Down Expand Up @@ -403,7 +405,7 @@ def HighlightCode(s, default_highlighter, debug_out=None):
except StopIteration:
break

if tok_id == html.StartTag:
if tok_id == h8_id.StartTag:

tag_lexer.Reset(pos, end_pos)
if tag_lexer.TagName() == 'pre':
Expand All @@ -416,7 +418,7 @@ def HighlightCode(s, default_highlighter, debug_out=None):
break

tag_lexer.Reset(pos, end_pos)
if tok_id == html.StartTag and tag_lexer.TagName() == 'code':
if tok_id == h8_id.StartTag and tag_lexer.TagName() == 'code':

css_class = tag_lexer.GetAttrRaw('class')
code_start_pos = end_pos
Expand Down Expand Up @@ -514,7 +516,7 @@ def HighlightCode(s, default_highlighter, debug_out=None):
except StopIteration:
break
tag_lexer.Reset(slash_code_right, end_pos)
assert tok_id == html.EndTag, tok_id
assert tok_id == h8_id.EndTag, tok_id
assert (tag_lexer.TagName() == 'pre'
), tag_lexer.TagName()
slash_pre_right = end_pos
Expand Down Expand Up @@ -559,7 +561,7 @@ def ExtractCode(s, f):
except StopIteration:
break

if tok_id == html.StartTag:
if tok_id == h8_id.StartTag:
tag_lexer.Reset(pos, end_pos)
if tag_lexer.TagName() == 'pre':
pre_start_pos = pos
Expand All @@ -571,7 +573,7 @@ def ExtractCode(s, f):
break

tag_lexer.Reset(pos, end_pos)
if tok_id == html.StartTag and tag_lexer.TagName() == 'code':
if tok_id == h8_id.StartTag and tag_lexer.TagName() == 'code':

css_class = tag_lexer.GetAttrRaw('class')
# Skip code blocks that look like ```foo
Expand Down
66 changes: 34 additions & 32 deletions doctools/ul_table.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env python2
"""ul_table.py: Markdown Tables Without New Syntax."""

from _devbuild.gen.htm8_asdl import h8_id, h8_id_str

try:
from cStringIO import StringIO
except ImportError:
Expand Down Expand Up @@ -32,7 +34,7 @@ def RemoveComments(s):
pos = 0

for tok_id, end_pos in html.ValidTokens(s):
if tok_id == html.Comment:
if tok_id == h8_id.Comment:
value = s[pos:end_pos]
# doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.
if 'REPLACE' not in value:
Expand All @@ -54,7 +56,7 @@ def __init__(self, lexer, tag_lexer):
self.lexer = lexer
self.tag_lexer = tag_lexer

self.tok_id = html.Invalid
self.tok_id = h8_id.Invalid
self.start_pos = 0
self.end_pos = 0

Expand All @@ -73,7 +75,7 @@ def _Next(self, comment_ok=False):

# Should have called RemoveComments() beforehand. That can still leave
# some REPLACE comments
if not comment_ok and self.tok_id == html.Comment:
if not comment_ok and self.tok_id == h8_id.Comment:
raise html.ParseError('Unexpected HTML comment')

if 0:
Expand All @@ -85,9 +87,9 @@ def _EatRawData(self, regex):
"""
Assert that we got text data matching a regex, and advance
"""
if self.tok_id != html.RawData:
if self.tok_id != h8_id.RawData:
raise html.ParseError('Expected RawData, got %s' %
html.TokenName(self.tok_id))
h8_id_str(self.tok_id))
actual = self._CurrentString()
m = re.match(regex, actual) # could compile this
if m is None:
Expand All @@ -101,16 +103,16 @@ def _Eat(self, expected_id, expected_tag):
Assert that we got a start or end tag, with the given name, and advance
Args:
expected_id: html.StartTag or html.EndTag
expected_id: h8_id.StartTag or h8_id.EndTag
expected_tag: 'a', 'span', etc.
"""
assert expected_id in (html.StartTag,
html.EndTag), html.TokenName(expected_id)
assert expected_id in (h8_id.StartTag,
h8_id.EndTag), h8_id_str(expected_id)

if self.tok_id != expected_id:
raise html.ParseError(
'Expected token %s, got %s' %
(html.TokenName(expected_id), html.TokenName(self.tok_id)))
(h8_id_str(expected_id), h8_id_str(self.tok_id)))
self.tag_lexer.Reset(self.start_pos, self.end_pos)
tag_name = self.tag_lexer.TagName()
if expected_tag != tag_name:
Expand All @@ -124,7 +126,7 @@ def _WhitespaceOk(self):
"""
Optional whitespace
"""
if (self.tok_id == html.RawData and
if (self.tok_id == h8_id.RawData and
_WHITESPACE_RE.match(self.lexer.s, self.start_pos)):
self._Next()

Expand All @@ -140,19 +142,19 @@ def FindUlTable(self):
# Find first table
while True:
self._Next(comment_ok=True)
if self.tok_id == html.EndOfStream:
if self.tok_id == h8_id.EndOfStream:
return -1

tag_lexer.Reset(self.start_pos, self.end_pos)
if (self.tok_id == html.StartTag and
if (self.tok_id == h8_id.StartTag and
tag_lexer.TagName() == 'table'):
while True:
self._Next(comment_ok=True)
if self.tok_id != html.RawData:
if self.tok_id != h8_id.RawData:
break

tag_lexer.Reset(self.start_pos, self.end_pos)
if (self.tok_id == html.StartTag and
if (self.tok_id == h8_id.StartTag and
tag_lexer.TagName() == 'ul'):
return self.start_pos
return -1
Expand Down Expand Up @@ -186,14 +188,14 @@ def _ListItem(self):
"""
self._WhitespaceOk()

if self.tok_id != html.StartTag:
if self.tok_id != h8_id.StartTag:
return None, None

inner_html = None
td_attrs = None # Can we also have col-attrs?
td_attrs_span = None

self._Eat(html.StartTag, 'li')
self._Eat(h8_id.StartTag, 'li')

left = self.start_pos

Expand All @@ -202,7 +204,7 @@ def _ListItem(self):
# because cells can have bulleted lists
balance = 0
while True:
if self.tok_id == html.StartEndTag:
if self.tok_id == h8_id.StartEndTag:
self.tag_lexer.Reset(self.start_pos, self.end_pos)
tag_name = self.tag_lexer.TagName()
# TODO: remove td-attrs backward compat
Expand All @@ -211,12 +213,12 @@ def _ListItem(self):
td_attrs = self.tag_lexer.AllAttrsRaw()
#log('CELL ATTRS %r', self._CurrentString())

elif self.tok_id == html.StartTag:
elif self.tok_id == h8_id.StartTag:
self.tag_lexer.Reset(self.start_pos, self.end_pos)
if self.tag_lexer.TagName() == 'li':
balance += 1

elif self.tok_id == html.EndTag:
elif self.tok_id == h8_id.EndTag:
self.tag_lexer.Reset(self.start_pos, self.end_pos)
if self.tag_lexer.TagName() == 'li':
balance -= 1
Expand All @@ -236,7 +238,7 @@ def _ListItem(self):
inner_html = s[left:right]
#log('RAW inner html %r', inner_html)

#self._Eat(html.EndTag, 'li')
#self._Eat(h8_id.EndTag, 'li')
self._Next()

return td_attrs, inner_html
Expand Down Expand Up @@ -284,15 +286,15 @@ def _ParseTHead(self):
cells = []

self._WhitespaceOk()
self._Eat(html.StartTag, 'li')
self._Eat(h8_id.StartTag, 'li')

# In CommonMark, r'thead\n' is enough, because it strips trailing
# whitespace. I'm not sure if other Markdown processors do that, so
# use r'thead\s+'.
self._EatRawData(r'thead\s+')

# This is the row data
self._Eat(html.StartTag, 'ul')
self._Eat(h8_id.StartTag, 'ul')

while True:
td_attrs, inner_html = self._ListItem()
Expand All @@ -301,10 +303,10 @@ def _ParseTHead(self):
cells.append((td_attrs, inner_html))
self._WhitespaceOk()

self._Eat(html.EndTag, 'ul')
self._Eat(h8_id.EndTag, 'ul')

self._WhitespaceOk()
self._Eat(html.EndTag, 'li')
self._Eat(h8_id.EndTag, 'li')

#log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
return cells
Expand Down Expand Up @@ -334,15 +336,15 @@ def _ParseTr(self):
self._WhitespaceOk()

# Could be a </ul>
if self.tok_id != html.StartTag:
if self.tok_id != h8_id.StartTag:
return None, None

self._Eat(html.StartTag, 'li')
self._Eat(h8_id.StartTag, 'li')

self._EatRawData(r'tr\s*')

tr_attrs = None
if self.tok_id == html.StartEndTag:
if self.tok_id == h8_id.StartEndTag:
self.tag_lexer.Reset(self.start_pos, self.end_pos)
tag_name = self.tag_lexer.TagName()
if tag_name != 'row-attrs':
Expand All @@ -352,7 +354,7 @@ def _ParseTr(self):
self._WhitespaceOk()

# This is the row data
self._Eat(html.StartTag, 'ul')
self._Eat(h8_id.StartTag, 'ul')

while True:
td_attrs, inner_html = self._ListItem()
Expand All @@ -363,10 +365,10 @@ def _ParseTr(self):

self._WhitespaceOk()

self._Eat(html.EndTag, 'ul')
self._Eat(h8_id.EndTag, 'ul')

self._WhitespaceOk()
self._Eat(html.EndTag, 'li')
self._Eat(h8_id.EndTag, 'li')

#log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
return tr_attrs, cells
Expand Down Expand Up @@ -394,7 +396,7 @@ def ParseTable(self):
table = {'tr': []}

ul_start = self.start_pos
self._Eat(html.StartTag, 'ul')
self._Eat(h8_id.StartTag, 'ul')

# Look ahead 2 or 3 tokens:
if self.lexer.LookAhead(r'\s*<li>thead\s+'):
Expand All @@ -416,7 +418,7 @@ def ParseTable(self):
#log('___ TR %s', tr)
table['tr'].append((tr_attrs, tr))

self._Eat(html.EndTag, 'ul')
self._Eat(h8_id.EndTag, 'ul')

self._WhitespaceOk()

Expand Down
Loading

0 comments on commit 6f81c12

Please sign in to comment.