doctype: parse all child nodes #83

milahu · 2024-02-21T12:15:24Z

input

<!doctype html><hr>

result: compound nodes are prefixed with #

# node 25 = fragment: '<!doctype html><hr>'
# node 26 = doctype: '<!doctype html>'
node 1 = <!: '<!' -> '<!'
node 4 = doctype: 'doctype' -> 'doctype'
node 3 = >: '>' -> ' html>'
# node 28 = element: '<hr>'
# node 31 = start_tag: '<hr>'
node 5 = <: '<' -> '<'
node 17 = tag_name: 'hr' -> 'hr'
node 3 = >: '>' -> '>'

problem: the 'html' in '<!doctype html>' has no parse node of its own,
and the closing '>' of '<!doctype html>'
has the same node type as the closing '>' of '<hr>'

note how ' html' spills into the source text of the '>' node
when the node source is computed as node_source = input_html[last_node_to:node.range.end_byte]

node 1 = <!: '<!' -> '<!'
node 4 = doctype: 'doctype' -> 'doctype'
node 3 = >: '>' -> ' html>'

this is causing problems in a semantic stage using this parser
where i want to ...

either ignore the compound node '<!doctype html>'
and process its child nodes '<!' and 'doctype' and 'html' and '>'

or process the compound node and ignore its child nodes

the cheap solution would be
to use a different node type for '>' of '<!doctype html>'

# https://github.com/tree-sitter/py-tree-sitter/issues/33
#def traverse_tree(tree: Tree):
def walk_html_tree(tree, func):
    """Visit every node reachable from ``tree`` in pre-order.

    ``tree`` is any object whose ``walk()`` returns a cursor offering
    ``node``, ``goto_first_child()``, ``goto_next_sibling()`` and
    ``goto_parent()``.

    ``func(node, is_compound)`` is called exactly once per node, parents
    before their children. ``is_compound`` is True when ``node.kind_id`` is
    one of the container kinds listed below.
    """
    # compound tags
    # these are ignored when serializing the tree
    # Leaf kinds 1 ('<!'), 3 ('>'), 10 (attribute_value), 12 (single quote)
    # and 14 (double quote) are deliberately NOT listed here.
    # NOTE(review): the numeric ids are specific to the HTML grammar build in
    # use -- confirm them if the grammar is upgraded.
    compound_kinds = frozenset((
        25,  # fragment
        26,  # doctype
        28,  # element
        29,  # script_element
        30,  # style_element
        31,  # start_tag
        34,  # self_closing_tag
        35,  # end_tag
        37,  # attribute
        38,  # quoted_attribute_value
    ))

    cursor = tree.walk()
    while True:
        current = cursor.node
        func(current, current.kind_id in compound_kinds)

        # Prefer going deeper; otherwise step sideways to the next sibling.
        if cursor.goto_first_child() or cursor.goto_next_sibling():
            continue

        # Dead end: climb until an ancestor has an unvisited sibling.
        while True:
            if not cursor.goto_parent():
                # Cannot climb any further: the whole tree has been visited.
                return
            if cursor.goto_next_sibling():
                break

# Byte offset at which the most recently printed leaf node ended.
# walk_callback slices input_html starting here, so any text lying between
# two leaf nodes ends up in the output of the later one.
last_node_to = 0

# Sample document, kept as bytes: walk_callback indexes it with the byte
# offsets reported by the parse nodes and calls .decode("utf8") on the slice,
# and the same value is handed to the parser.
input_html = b"<!doctype html><hr>"

def walk_callback(node, is_compound):
    """Print a one-line description of ``node`` (callback for walk_html_tree).

    Compound nodes are printed with a leading ``#`` and only their own text.
    For every other node the line also shows the slice of ``input_html`` from
    the end of the previously printed non-compound node up to the end of this
    node, which makes any source text that has no node of its own visible.

    Reads the module-level ``input_html`` (expected to be ``bytes``, since
    the slice is decoded) and reads/updates the module-level
    ``last_node_to``.
    """
    # `last_node_to` lives at module level, so it has to be declared `global`;
    # `nonlocal` is only valid inside a nested function and is a SyntaxError
    # here.
    global last_node_to

    s = repr(node.text.decode("utf8"))
    if len(s) > 50:
        s = s[0:50] + "..."

    if not is_compound:
        # Everything since the previous leaf, including text without a node.
        node_source = input_html[last_node_to:node.range.end_byte]
        last_node_to = node.range.end_byte
        node_source = node_source.decode("utf8")
        if len(node_source) > 50:
            node_source = node_source[0:50] + "..."
        print(f"node {node.kind_id} = {node.type}: {s} -> {repr(node_source)}")
    else:
        print(f"# node {node.kind_id} = {node.type}: {s}")

import tree_sitter
import tree_sitter_languages

# Build the HTML parser once; both names refer to the same parser object.
html_parser = tree_sitter_languages.get_parser("html")
tree_sitter_html = html_parser

# Parse the sample document, then walk it from its root node, printing one
# line per node through walk_callback.
html_tree = html_parser.parse(input_html)
top_node = html_tree.root_node
walk_html_tree(top_node, walk_callback)

The text was updated successfully, but these errors were encountered:

amaanq · 2024-02-21T14:14:47Z

does the is_named property not help you here?

milahu · 2024-02-21T18:19:09Z

no, this does not help to distinguish the two '>' nodes

  node kind_id= 3 type=>          is_named=False '>'                       ' html>'
  node kind_id= 3 type=>          is_named=False '>'                       '>'

# node kind_id=25 type=fragment   is_named=True  '<!doctype html><hr>'    
# node kind_id=26 type=doctype    is_named=True  '<!doctype html>'        
  node kind_id= 1 type=<!         is_named=False '<!'                      '<!'
  node kind_id= 4 type=doctype    is_named=False 'doctype'                 'doctype'
  node kind_id= 3 type=>          is_named=False '>'                       ' html>'
# node kind_id=28 type=element    is_named=True  '<hr>'                   
# node kind_id=31 type=start_tag  is_named=True  '<hr>'                   
  node kind_id= 5 type=<          is_named=False '<'                       '<'
  node kind_id=17 type=tag_name   is_named=True  'hr'                      'hr'
  node kind_id= 3 type=>          is_named=False '>'                       '>'

def walk_callback(node, is_compound):
    """Print an aligned one-line description of ``node``, including is_named.

    Compound nodes are printed with a leading ``#`` and only their own text.
    For every other node the line also shows the slice of ``input_html`` from
    the end of the previously printed non-compound node up to the end of this
    node, and that raw slice is appended to ``walk_html_tree_test_result``.

    Reads the module-level ``input_html`` (expected to be ``bytes``, since
    the slice is decoded) and reads/updates the module-level
    ``last_node_to`` and ``walk_html_tree_test_result``.
    NOTE(review): ``walk_html_tree_test_result`` is presumably initialised to
    ``b""`` before the walk starts -- confirm at the call site.
    """
    # Both names live at module level, so they must be declared `global`:
    # `nonlocal` is a SyntaxError outside a nested function, and an undeclared
    # `+=` target would be treated as an unbound local.
    global last_node_to, walk_html_tree_test_result

    s = repr(node.text.decode("utf8"))
    if len(s) > 50:
        s = s[0:50] + "..."

    if not is_compound:
        # Everything since the previous leaf, including text without a node.
        node_source = input_html[last_node_to:node.range.end_byte]
        last_node_to = node.range.end_byte
        # Accumulate the raw slices; concatenated they rebuild the input.
        walk_html_tree_test_result += node_source
        node_source = node_source.decode("utf8")
        if len(node_source) > 50:
            node_source = node_source[0:50] + "..."
        print(f"  node kind_id={node.kind_id:2d} type={node.type:10s} is_named={str(node.is_named):5s} {s:25s} {repr(node_source)}")
    else:
        print(f"# node kind_id={node.kind_id:2d} type={node.type:10s} is_named={str(node.is_named):5s} {s:25s}")

currently i use the workaround

# Workaround for the doctype's '>' sharing kind id 3 with every other '>':
# remember that a doctype was opened (kind 1 is '<!', kind 4 is 'doctype'),
# and treat the next '>' as the one that closes it.
if node_type_id in (1, 4):
    in_doctype_node = True
elif node_type_id == 3 and in_doctype_node:
    in_doctype_node = False
    # NOTE(review): the +1 / -1 offsets presumably account for the single
    # space before 'html' -- confirm against the surrounding serializer.
    node_source = input_html[(last_node_to + 1):node.range.end_byte]
    node_source_space_before = b""
    last_node_to = node.range.start_byte - 1

this is low priority...
i'm just surprised that the contents of '<!doctype html>' don't show up in the parse tree

to compare:
lezer-parser-html produces only one node for '<!doctype html>'

node 15 = Document: '<!doctype html><hr>'
node 43 = DoctypeDecl: '<!doctype html>'
node 20 = Element: '<hr>'
node 38 = SelfClosingTag: '<hr>'
node 10 = StartTag: '<'
node 22 = TagName: 'hr'
node 4 = EndTag: '>'

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

doctype: parse all child nodes #83

doctype: parse all child nodes #83

milahu commented Feb 21, 2024 •

edited

Loading

amaanq commented Feb 21, 2024

milahu commented Feb 21, 2024 •

edited

Loading

doctype: parse all child nodes #83

doctype: parse all child nodes #83

Comments

milahu commented Feb 21, 2024 • edited Loading

amaanq commented Feb 21, 2024

milahu commented Feb 21, 2024 • edited Loading

milahu commented Feb 21, 2024 •

edited

Loading

milahu commented Feb 21, 2024 •

edited

Loading