You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
#!/usr/bin/env node
/*
npm init -y
npm install @lezer/html
*/

import { parser as lezerParserHtml } from '@lezer/html';

// NOTE(review): whitespace inside this literal may have been collapsed when
// the snippet was copied — confirm against the original test input.
const inputHtml = `<div> aaa bbb</div>`;

const htmlParser = lezerParserHtml.configure({
  strict: true, // throw on parse error
  //dialect: "selfClosing",
});

const htmlTree = htmlParser.parse(inputHtml);

const rootNode = htmlTree.topNode;

/**
 * Node type ids that are NOT reported to the walk callback.
 *
 * NOTE(review): these numeric ids presumably depend on the installed
 * @lezer/html grammar version — confirm when upgrading the parser.
 *
 * note: this is inconsistent in the parser
 * InvalidEntity is a child node,
 * EntityReference is a separate node (sibling of other text nodes).
 * EntityReference (id 17) is deliberately not listed here, so it is reported.
 */
const SKIPPED_NODE_TYPE_IDS = new Set([
  15, // Document
  20, // Element
  23, // Attribute
  21, // OpenTag <script>
  30, // OpenTag <style>
  36, // OpenTag
  32, // CloseTag </style>
  29, // CloseTag </script>
  37, // CloseTag
  38, // SelfClosingTag
  19, // InvalidEntity: <a href="?a=1&b=2" -> "&" is parsed as InvalidEntity
]);

// based on nix-eval-js/src/lezer-parser-nix/src/nix-format.js

/**
 * Walk a tree (or a single node's subtree) in pre-order (NLR: Node, Left,
 * Right) and call `func` for every node whose type id is not in
 * SKIPPED_NODE_TYPE_IDS.
 *
 * When `tree` is a node, the walk stops at the end of that node: siblings and
 * parents of the start node are not visited.
 *
 * NOTE InvalidEntity breaks the parser
 * <a t="a&b&c">a&b&c</a>
 * -> require valid input, throw on parse error
 *
 * @param {Tree | TreeNode} tree - object providing a `cursor()` method
 * @param {(cursor: object) => void} func - called with the walking cursor;
 *   the same cursor object is passed on every call and keeps moving, so
 *   read the fields you need (`from`, `to`, `type`) inside the callback.
 * @returns {void}
 */
function walkHtmlTree(tree, func) {
  const cursor = tree.cursor();
  if (!cursor) return;

  // Depth relative to the start node; 0 means "at the start node".
  let depth = 0;

  while (true) {
    // Node
    if (!SKIPPED_NODE_TYPE_IDS.has(cursor.type.id)) {
      func(cursor);
    }

    // Left
    if (cursor.firstChild()) {
      // moved down
      depth++;
      continue;
    }

    // Right
    if (depth > 0 && cursor.nextSibling()) {
      // moved right
      continue;
    }

    // No child and no sibling: climb until an ancestor has a next sibling.
    let movedRight = false;
    while (cursor.parent()) {
      // moved up
      depth--;
      if (depth <= 0) {
        // when tree is a node, stop at the end of node
        // == dont visit sibling or parent nodes
        return;
      }
      if (cursor.nextSibling()) {
        // moved up + right
        movedRight = true;
        break;
      }
    }

    if (!movedRight) {
      // Reached the top of the tree without finding another node to visit.
      return;
    }
  }
}

// End offset of the previously printed node; the gap between it and the
// current node's end is the "lookbehind" source shown in the right column.
let lastNodeTo = 0;

// Maximum printed width of each source column.
const maxLen = 30;

walkHtmlTree(rootNode, (node) => {
  let nodeSource = JSON.stringify(inputHtml.slice(node.from, node.to));
  let spaceNodeSource = JSON.stringify(inputHtml.slice(lastNodeTo, node.to));
  if (nodeSource.length > maxLen) {
    nodeSource = nodeSource.slice(0, maxLen);
  }
  if (spaceNodeSource.length > maxLen) {
    spaceNodeSource = spaceNodeSource.slice(0, maxLen);
  }
  console.log(`node ${String(node.type.id).padStart(2)} = ${node.type.name.padEnd(15)} : ${nodeSource.padEnd(maxLen)} : ${spaceNodeSource}`);
  lastNodeTo = node.to;
});
The text was updated successfully, but these errors were encountered:
whitespace is significant for lossless AST transformers
When this is not handled by the parser,
I need extra code in the semantic stage to look ahead to the next node.
similar #40
test.html
The right source column has the lookbehind source plus `node.text`, which is easy to do with `node.range.end_byte` of the previous node. Lookahead would be more complex...
tree-sitter-html
lezer-parser-html
note how both source columns are identical
so this is a truly "lossless" parser (CST parser)
diff
repro.py
repro.js
The text was updated successfully, but these errors were encountered: