Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

leading and trailing whitespace is lost when it should be part of text nodes #87

Open
milahu opened this issue Mar 2, 2024 · 0 comments · May be fixed by #89 or #108
Open

leading and trailing whitespace is lost when it should be part of text nodes #87

milahu opened this issue Mar 2, 2024 · 0 comments · May be fixed by #89 or #108

Comments

@milahu
Copy link

milahu commented Mar 2, 2024

whitespace is significant for lossless AST transformers

when the parser does not handle this,
I need extra code in the semantic stage to look ahead to the next node

similar #40

In the specific case of HTML parsing, I couldn't tell if or when it's reasonable to treat leading and trailing whitespace as significant.

test.html


<div>

  aaa

  bbb

</div>

the right-hand source column shows the lookbehind source (everything since the end of the previous node) plus node.text,
which is easy to compute using node.range.end_byte of the previous node

lookahead would be more complex...

tree-sitter-html

node  5 = <               : "<"                            : "\n<"
node 17 = tag_name        : "div"                          : "div"
node  3 = >               : ">"                            : ">"
node 16 = text            : "aaa\n\n  bbb"                 : "\n\n  aaa\n\n  bbb"
node  7 = </              : "</"                           : "\n\n</"
node 17 = tag_name        : "div"                          : "div"
node  3 = >               : ">"                            : ">"

lezer-parser-html

note how both source columns are identical
so this is a truly "lossless" parser (CST parser)

node 16 = Text            : "\n"                           : "\n"
node  6 = StartTag        : "<"                            : "<"
node 22 = TagName         : "div"                          : "div"
node  4 = EndTag          : ">"                            : ">"
node 16 = Text            : "\n\n  aaa\n\n  bbb\n\n"       : "\n\n  aaa\n\n  bbb\n\n"
node 11 = StartCloseTag   : "</"                           : "</"
node 22 = TagName         : "div"                          : "div"
node  4 = EndTag          : ">"                            : ">"
node 16 = Text            : "\n"                           : "\n"

diff

+ node 16 = Text            : "\n"                           : "\n"
- node 16 = text            : "aaa\n\n  bbb"                 : "\n\n  aaa\n\n  bbb"
+ node 16 = Text            : "\n\n  aaa\n\n  bbb\n\n"       : "\n\n  aaa\n\n  bbb\n\n"
+ node 16 = Text            : "\n"                           : "\n"
repro.py
#!/usr/bin/env python3

# pip install tree-sitter tree-sitter-languages

import json
import tree_sitter
import tree_sitter_languages

# Sample HTML (as bytes) that is parsed below. It starts and ends with a
# newline and contains blank lines and two-space-indented text lines inside
# the <div>.
input_html_bytes = b"""
<div>

  aaa

  bbb

</div>
"""

def walk_html_tree(tree, func):
    """Visit every node reachable from ``tree`` in pre-order.

    ``tree`` must provide ``walk()`` returning a cursor with ``node``,
    ``goto_first_child()``, ``goto_next_sibling()`` and ``goto_parent()``.
    For each visited node, ``func(node, is_compound)`` is called, where
    ``is_compound`` tells whether the node's ``kind_id`` is one of the
    compound kinds listed below. Returns ``None``.
    """
    # Kind ids of compound tags; these are ignored when serializing the tree.
    # Not treated as compound: 1 '<!', 3 '>', 14 '"', 12 "'", 10 attribute_value.
    compound_kind_id = frozenset((
        25,  # fragment
        26,  # doctype
        28,  # element
        29,  # script_element
        30,  # style_element
        31,  # start_tag
        34,  # self_closing_tag
        35,  # end_tag
        37,  # attribute
        38,  # quoted_attribute_value
    ))
    cursor = tree.walk()
    while True:
        current = cursor.node
        func(current, current.kind_id in compound_kind_id)

        # Descend first, otherwise step sideways.
        if cursor.goto_first_child() or cursor.goto_next_sibling():
            continue

        # Neither child nor sibling: climb until some ancestor has a next
        # sibling, or stop once the cursor cannot move up any further.
        while True:
            could_not_ascend = not cursor.goto_parent()
            moved_sideways = cursor.goto_next_sibling()
            if could_not_ascend:
                return
            if moved_sideways:
                break

# Output options read by walk_callback_test.
max_len = 30                  # JSON-quoted text longer than this is cut and suffixed with "..."
show_compound_nodes = False   # when True, compound nodes are printed too (prefixed with "# ")

# Mutable state updated by walk_callback_test.
node_idx = -1                 # incremented once per callback invocation
last_node_to = 0              # end byte of the most recently printed non-compound node

def walk_callback_test(node, is_compound):
    """Print one table row for ``node`` and update the module-level walk state.

    A row has the form ``node <kind_id> = <type> : <text> : <gap + text>``.
    The last column is ``input_html_bytes`` from ``last_node_to`` up to the
    node's end byte, i.e. the node text plus whatever source precedes it
    since the last printed non-compound node. Both text columns are
    JSON-quoted and cut to ``max_len`` characters (with ``"..."`` appended).

    Non-compound nodes are always printed and advance ``last_node_to``.
    Compound nodes are printed (prefixed with ``"# "``) only when
    ``show_compound_nodes`` is true and never advance ``last_node_to``.
    ``node_idx`` is incremented on every call.

    Parameters:
        node: object exposing ``text`` (bytes), ``kind_id`` (int),
            ``type`` (str) and ``range.end_byte`` (int).
        is_compound: whether the node is a compound (container) node.
    """
    global node_idx
    global last_node_to

    def quote_and_clip(raw_bytes):
        # JSON-quote so that newlines and spaces are visible, then truncate.
        quoted = json.dumps(raw_bytes.decode("utf8"))
        if len(quoted) > max_len:
            quoted = quoted[0:max_len] + "..."
        return quoted

    if not is_compound or show_compound_nodes:
        node_text = quote_and_clip(node.text)
        # Computed for compound nodes as well: it used to be assigned only in
        # the non-compound branch, so printing a compound node raised
        # UnboundLocalError as soon as show_compound_nodes was enabled.
        space_node_text = quote_and_clip(input_html_bytes[last_node_to:node.range.end_byte])
        if is_compound:
            line_prefix = "# "
        else:
            line_prefix = "  " if show_compound_nodes else ""
        print(line_prefix + f"node {node.kind_id:2d} = {node.type:15s} : {node_text:30s} : {space_node_text}")

    if not is_compound:
        last_node_to = node.range.end_byte

    node_idx += 1

# Obtain a parser for the "html" language from tree_sitter_languages.
tree_sitter_html = tree_sitter_languages.get_parser("html")

# Parse the sample input bytes into a syntax tree.
html_tree = tree_sitter_html.parse(input_html_bytes)

# Walk from the root node; walk_callback_test is called for every visited node.
walk_html_tree(html_tree.root_node, walk_callback_test)
repro.js
#!/usr/bin/env node

/*
npm init -y
npm install @lezer/html
*/

import { parser as lezerParserHtml } from '@lezer/html';

// Sample HTML that is parsed below. It starts and ends with a newline and
// contains blank lines and two-space-indented text lines inside the <div>.
const inputHtml = `
<div>

  aaa

  bbb

</div>
`;

// Parser derived from the @lezer/html parser with the options below.
const htmlParser = lezerParserHtml.configure({
    strict: true, // throw on parse error
    //dialect: "selfClosing",
});

// Parse the sample input into a tree.
const htmlTree = htmlParser.parse(inputHtml);

// Top node of the parsed tree; this is the node that gets walked below.
const rootNode = htmlTree.topNode;

// based on nix-eval-js/src/lezer-parser-nix/src/nix-format.js
/**
 * Walk `tree` in pre-order (NLR: node, then children left to right) and call
 * `func` for every node whose type id is not one of the container types
 * listed below.
 *
 * The walk is confined to the starting node: siblings and ancestors of the
 * node the cursor starts on are never visited.
 *
 * The same cursor object is passed to `func` on every call and is moved
 * again right after `func` returns, so read what you need inside `func`.
 *
 * NOTE InvalidEntity breaks the parser
 * <a t="a&amp;b&c">a&amp;b&c</a>
 * -> require valid input, throw on parse error
 *
 * @param {Tree | TreeNode} tree - object whose `cursor()` returns a cursor
 *   exposing `type.id`, `firstChild()`, `nextSibling()` and `parent()`
 * @param {(cursor: object) => void} func - called with the cursor positioned
 *   on each non-container node
 * @returns {void}
 */
function walkHtmlTree(tree, func) {
    // Type ids of container nodes that are skipped (their children are still
    // visited). The ids are hard-coded for the parser build used here.
    const containerTypeIds = new Set([
        15, // Document
        20, // Element
        23, // Attribute
        21, // OpenTag <script>
        30, // OpenTag <style>
        36, // OpenTag
        32, // CloseTag </style>
        29, // CloseTag </script>
        37, // CloseTag
        38, // SelfClosingTag
        // note: this is inconsistent in the parser
        // InvalidEntity is child node
        // EntityReference is separate node (sibling of other text nodes)
        19, // InvalidEntity: <a href="?a=1&b=2" -> "&" is parsed as InvalidEntity
        // 17 (EntityReference: "&amp;" or "&mdash;") is deliberately not skipped
    ]);

    const cursor = tree.cursor();
    if (!cursor) return;

    // Depth relative to the starting node; used to stay inside its subtree.
    let depth = 0;
    while (true) {
        // Node
        if (!containerTypeIds.has(cursor.type.id)) {
            func(cursor);
        }
        // Left
        if (cursor.firstChild()) {
            depth++;
            continue;
        }
        // Right (never step sideways from the starting node itself)
        if (depth > 0 && cursor.nextSibling()) {
            continue;
        }
        // Up: climb until an ancestor inside the subtree has a next sibling.
        let movedRight = false;
        while (cursor.parent()) {
            depth--;
            if (depth <= 0) {
                // back at (or above) the starting node: the subtree is done
                return;
            }
            if (cursor.nextSibling()) {
                movedRight = true;
                break;
            }
        }
        if (!movedRight) return;
    }
}

// Maximum length of a JSON-quoted source column.
const maxLen = 30;
// End offset of the previously reported node.
let lastNodeTo = 0;

// JSON-quote inputHtml[from, to) so whitespace is visible, cut to maxLen.
const quoteAndClip = (from, to) => {
    const quoted = JSON.stringify(inputHtml.slice(from, to));
    return quoted.length > maxLen ? quoted.slice(0, maxLen) : quoted;
};

// Print one row per visited node: type id, type name, the node's own source,
// and the source from the end of the previous node up to the end of this one.
walkHtmlTree(rootNode, (node) => {
    const ownSource = quoteAndClip(node.from, node.to);
    const gapAndOwnSource = quoteAndClip(lastNodeTo, node.to);
    const idColumn = String(node.type.id).padStart(2);
    const nameColumn = node.type.name.padEnd(15);
    console.log(`node ${idColumn} = ${nameColumn} : ${ownSource.padEnd(maxLen)} : ${gapAndOwnSource}`);
    lastNodeTo = node.to;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
1 participant