From 48637e24882afb667528769267d4452ca5aa144c Mon Sep 17 00:00:00 2001 From: Tobias Nickel Date: Tue, 19 Jan 2021 20:26:18 +0800 Subject: [PATCH] version 4.0.1, fixed types, hr-closingtag, keepWhitespace option --- README.md | 14 ++++++++++---- package.json | 3 ++- tXml.d.ts | 10 ++++++---- tXml.js | 17 ++++++++++++----- tXml.min.js | 20 +++++++++++--------- test.js | 15 +++++++++++++-- test/examples/wordpad.docx.document.xml | 1 + 7 files changed, 55 insertions(+), 25 deletions(-) create mode 100644 test/examples/wordpad.docx.document.xml diff --git a/README.md b/README.md index aed249a..6115083 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ so, there are good reasons to give tXml.js a try. ## Try Online -Try without installing online: http://tnickel.de/2017/04/02/txml-online +Try without installing online: https://tnickel.de/2017/04/02/txml-online ## new in version 4 - improved support for CDATA @@ -69,6 +69,8 @@ and then in your script you require it by `const txml = require('txml');` or in - **filter** a method, to filter for interesting nodes, use it like Array.filter. - **simplify** to simplify the object, to an easier access. - **pos** where to start parsing. + - **keepComments** if you want to keep comments in your data (kept as a string including `<!-- -->`) (default false) + - **keepWhitespace** keep whitespace such as spaces, tabs and line breaks as string content (default false) - **noChildNodes** array of nodes, that have no children and don't need to be closed. Default is working good for html. For example when parsing rss, the link tag is used to really provide an URL that the user can open. In html however a link text is used to bind css or other resource into the document. In HTML it does not need to get closed. so by default the noChildNodes containes the tagName 'link'. Same as 'img', 'br', 'input', 'meta', 'link'. That means: when parsing rss, it makes to set `noChildNodes` to [], an empty array. 
```js txml.parse(` @@ -178,13 +180,17 @@ for await(let element of xmlStream) { // your logic here ... } ``` -The transform stream is great, because when your logic within the processing loop is slow, the file read stream will also run slower, and not fill up the RAM memory. For a more detailed explanation read [here](http://tnickel.de/2019/10/15/2019-10-for-async-on-nodejs-streams/) - +The transform stream is great, because when your logic within the processing loop is slow, the file read stream will also run slower, and not fill up the RAM memory. For a more detailed explanation read [here](https://tnickel.de/2019/10/15/2019-10-for-async-on-nodejs-streams/) +## Changelog + - version 4.0.1 + - fixed children type definition not to include number (issue #20) + - add `hr` to self closing tags + - new parser option `keepWhitespace` (issue #21) ## Developer ![Tobias Nickel](https://avatars1.githubusercontent.com/u/4189801?s=150) -[Tobias Nickel](http://tnickel.de/) German software developer in Shanghai. +[Tobias Nickel](https://tnickel.de/) German software developer in Shanghai. 
diff --git a/package.json b/package.json index beead77..0d5c886 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "txml", - "version": "4.0.0", + "version": "4.0.1", "description": "fastest XML DOM Parser for node/browser/worker", "main": "tXml.js", "scripts": { @@ -26,6 +26,7 @@ "bugs": { "url": "https://github.com/TobiasNickel/tXml/issues" }, + "types":"tXml.d.ts", "homepage": "https://github.com/TobiasNickel/tXml#readme", "dependencies": { "through2": "^3.0.1" diff --git a/tXml.d.ts b/tXml.d.ts index d00e482..8b53bfb 100644 --- a/tXml.d.ts +++ b/tXml.d.ts @@ -1,13 +1,14 @@ export type tNode = { tagName: string; attributes: object; - children: tNode | string | number[]; + children: (tNode | string)[]; }; export type TParseOptions = { pos?: number; noChildNodes?: string[]; setPos?: boolean; keepComments?: boolean; + keepWhitespace?: boolean; simplify?: boolean; filter?: (a: tNode, b: tNode) => boolean; }; @@ -20,7 +21,7 @@ export type TParseOptions = { * @typedef tNode * @property {string} tagName * @property {object} attributes - * @property {tNode|string|number[]} children + * @property {(tNode|string)[]} children **/ /** * @typedef TParseOptions @@ -28,6 +29,7 @@ export type TParseOptions = { * @property {string[]} [noChildNodes] * @property {boolean} [setPos] * @property {boolean} [keepComments] + * @property {boolean} [keepWhitespace] * @property {boolean} [simplify] * @property {(a: tNode, b: tNode) => boolean} [filter] */ @@ -35,9 +37,9 @@ export type TParseOptions = { * parseXML / html into a DOM Object. 
with no validation and some failur tolerance * @param {string} S your XML to parse * @param {TParseOptions} [options] all other options: - * @return {(tNode | string | number)[]} + * @return {(tNode | string)[]} */ -export function parse(S: string, options?: TParseOptions): (tNode | string | number)[]; +export function parse(S: string, options?: TParseOptions): (tNode | string)[]; /** * transform the DomObject to an object that is like the object of PHP`s simple_xmp_load_*() methods. * this format helps you to write that is more likely to keep your program working, even if there a small changes in the XML schema. diff --git a/tXml.js b/tXml.js index 0001e9e..c639d8e 100644 --- a/tXml.js +++ b/tXml.js @@ -24,7 +24,7 @@ module.exports = { * @typedef tNode * @property {string} tagName * @property {object} attributes - * @property {tNode|string|number[]} children + * @property {(tNode|string)[]} children **/ /** @@ -33,6 +33,7 @@ module.exports = { * @property {string[]} [noChildNodes] * @property {boolean} [setPos] * @property {boolean} [keepComments] + * @property {boolean} [keepWhitespace] * @property {boolean} [simplify] * @property {(a: tNode, b: tNode) => boolean} [filter] */ @@ -41,13 +42,15 @@ module.exports = { * parseXML / html into a DOM Object. 
with no validation and some failur tolerance * @param {string} S your XML to parse * @param {TParseOptions} [options] all other options: - * @return {(tNode | string | number)[]} + * @return {(tNode | string)[]} */ function parse(S, options) { "use strict"; options = options || {}; var pos = options.pos || 0; + var keepComments = !!options.keepComments; + var keepWhitespace = !!options.keepWhitespace var openBracket = "<"; var openBracketCC = "<".charCodeAt(0); @@ -96,7 +99,7 @@ function parse(S, options) { if (pos === -1) { pos = S.length } - if (options.keepComments === true) { + if (keepComments) { children.push(S.substring(startCommentPos, pos + 1)); } } else if ( @@ -140,7 +143,7 @@ function parse(S, options) { } } else { var text = parseText() - if (text.trim().length > 0) + if (keepWhitespace || text.trim().length > 0) children.push(text); pos++; } @@ -174,7 +177,7 @@ function parse(S, options) { * is parsing a node, including tagName, Attributes and its children, * to parse children it uses the parseChildren again, that makes the parsing recursive */ - var NoChildNodes = options.noChildNodes || ['img', 'br', 'input', 'meta', 'link']; + var NoChildNodes = options.noChildNodes || ['img', 'br', 'input', 'meta', 'link', 'hr']; function parseNode() { pos++; @@ -421,6 +424,10 @@ function stringify(O) { out += ' ' + i + "='" + N.attributes[i].trim() + "'"; } } + if(N.tagName[0]==='?'){ + out += '?>'; + return; + } out += '>'; writeChildren(N.children); out += ''; diff --git a/tXml.min.js b/tXml.min.js index 9018b57..d089470 100644 --- a/tXml.min.js +++ b/tXml.min.js @@ -1,9 +1,11 @@ -var txml=function(){function q(b,c){function d(h){for(var m=[];b[a];)if(60==b.charCodeAt(a)){if(47===b.charCodeAt(a+1)){var f=a+2;a=b.indexOf(">",a);if(-1==b.substring(f,a).indexOf(h))throw h=b.substring(0,a).split("\n"),Error("Unexpected close tag\nLine: "+(h.length-1)+"\nColumn: "+(h[h.length-1].length+1)+"\nChar: "+b[a]);a+1&&(a+=1);break}else 
if(33===b.charCodeAt(a+1)){if(45==b.charCodeAt(a+2)){for(f=a;-1!==a&&(62!==b.charCodeAt(a)||45!=b.charCodeAt(a-1)||45!=b.charCodeAt(a-2)||-1==a);)a=b.indexOf(">", -a+1);-1===a&&(a=b.length);!0===c.keepComments&&m.push(b.substring(f,a+1))}else if(91===b.charCodeAt(a+2)&&91===b.charCodeAt(a+8)&&"cdata"===b.substr(a+3,5).toLowerCase()){f=b.indexOf("]]\x3e",a);-1==f?(m.push(b.substr(a+9)),a=b.length):(m.push(b.substring(a+9,f)),a=f+3);continue}else{f=a+1;a+=2;for(var n=!1;(62!==b.charCodeAt(a)||!0===n)&&b[a];)91===b.charCodeAt(a)?n=!0:!0===n&&93===b.charCodeAt(a)&&(n=!1),a++;m.push(b.substring(f,a))}a++;continue}f=g();m.push(f);"?"===f.tagName[0]&&(m.push.apply(m, -f.children),f.children=[])}else f=a,a=b.indexOf("<",a)-1,-2===a&&(a=b.length),f=b.slice(f,a+1),0n||96n){n=e();for(var k=b.charCodeAt(a);k&&39!==k&&34!==k&&!(64k||96k)&&62!==k;)a++,k=b.charCodeAt(a);if(39===k||34===k){if(k=a+1,a=b.indexOf(b[a],k),k=b.slice(k, -a),-1===a)return{tagName:h,attributes:m,children:f}}else k=null,a--;m[n]=k}a++}47!==b.charCodeAt(a-1)?"script"==h?(f=a+1,a=b.indexOf("\x3c/script>",a),f=[b.slice(f,a)],a+=9):"style"==h?(f=a+1,a=b.indexOf("",a),f=[b.slice(f,a)],a+=8):-1===x.indexOf(h)?(a++,f=d(h)):a++:a++;return{tagName:h,attributes:m,children:f}}function l(){var h=(new RegExp("\\s"+c.attrName+"\\s*=['\"]"+c.attrValue+"['\"]")).exec(b);return h?h.index:-1}c=c||{};var a=c.pos||0,r="\r\n\t>/= ",x=c.noChildNodes||["img","br", -"input","meta","link"],p=null;if(void 0!==c.attrValue)for(c.attrName=c.attrName||"id",p=[];-1!==(a=l());)a=b.lastIndexOf("<",a),-1!==a&&p.push(g()),b=b.substr(a),a=0;else p=c.parseNode?g():d("");c.filter&&(p=t(p,c.filter));c.setPos&&(p.pos=a);return p}function v(b){var c={};if(!b.length)return"";if(1===b.length&&"string"==typeof b[0])return b[0];b.forEach(function(e){if("object"===typeof e){c[e.tagName]||(c[e.tagName]=[]);var g=v(e.children);c[e.tagName].push(g);Object.keys(e.attributes).length&& -(g._attributes=e.attributes)}});for(var d in 
c)1==c[d].length&&(c[d]=c[d][0]);return c}function w(b,c){c=void 0===c?{}:c;var d={};if(!b.length)return d;if(1===b.length&&"string"==typeof b[0])return Object.keys(c).length?{_attributes:c,value:b[0]}:b[0];b.forEach(function(e){if("object"===typeof e){d[e.tagName]||(d[e.tagName]=[]);var g=w(e.children||[],e.attributes);d[e.tagName].push(g);Object.keys(e.attributes).length&&(g._attributes=e.attributes)}});return d}function t(b,c,d,e){d=void 0===d?0:d;e= -void 0===e?"":e;var g=[];b.forEach(function(l,a){"object"===typeof l&&c(l,a,d,e)&&g.push(l);if(l.children){var r=t(l.children,c,d+1,(e?e+".":"")+a+"."+l.tagName);g=g.concat(r)}});return g}function u(b){if(Array.isArray(b)){var c="";b.forEach(function(d){c+=" "+u(d);c=c.trim()});return c}return"object"===typeof b?u(b.children):" "+b}return{parse:q,simplify:v,simplifyLostLess:w,filter:t,stringify:function(b){function c(e){if(e)for(var g=0;g"}}var d="";c(b);return d},toContentString:u,getElementById:function(b,c,d){b=q(b,{attrValue:c});return d?tXml.simplify(b):b[0]},getElementsByClassName:function(b,c,d){b=q(b,{attrName:"class",attrValue:"[a-zA-Z0-9- ]*"+c+"[a-zA-Z0-9- ]*"});return d?tXml.simplify(b): -b}}}(); \ No newline at end of file +var $jscomp=$jscomp||{};$jscomp.scope={};$jscomp.arrayIteratorImpl=function(h){var p=0;return function(){return p",a);if(-1==b.substring(f,a).indexOf(k))throw k=b.substring(0,a).split("\n"),Error("Unexpected close tag\nLine: "+(k.length-1)+"\nColumn: "+(k[k.length-1].length+1)+"\nChar: "+b[a]);a+1&&(a+=1);break}else if(33===b.charCodeAt(a+1)){if(45==b.charCodeAt(a+2)){for(f=a;-1!==a&&(62!==b.charCodeAt(a)||45!=b.charCodeAt(a-1)||45!=b.charCodeAt(a-2)||-1==a);)a=b.indexOf(">", +a+1);-1===a&&(a=b.length);u&&n.push(b.substring(f,a+1))}else if(91===b.charCodeAt(a+2)&&91===b.charCodeAt(a+8)&&"cdata"===b.substr(a+3,5).toLowerCase()){f=b.indexOf("]]\x3e",a);-1==f?(n.push(b.substr(a+9)),a=b.length):(n.push(b.substring(a+9,f)),a=f+3);continue}else{f=a+1;a+=2;for(var 
q=!1;(62!==b.charCodeAt(a)||!0===q)&&b[a];)91===b.charCodeAt(a)?q=!0:!0===q&&93===b.charCodeAt(a)&&(q=!1),a++;n.push(b.substring(f,a))}a++;continue}f=g();n.push(f);"?"===f.tagName[0]&&(n.push.apply(n,$jscomp.arrayFromIterable(f.children)), +f.children=[])}else f=a,a=b.indexOf("<",a)-1,-2===a&&(a=b.length),f=b.slice(f,a+1),(x||0q||96q){q=e();for(var l=b.charCodeAt(a);l&&39!==l&&34!==l&&!(64l||96l)&&62!==l;)a++,l=b.charCodeAt(a);if(39===l||34===l){if(l=a+1,a=b.indexOf(b[a],l),l=b.slice(l,a),-1=== +a)return{tagName:k,attributes:n,children:f}}else l=null,a--;n[q]=l}a++}47!==b.charCodeAt(a-1)?"script"==k?(f=a+1,a=b.indexOf("\x3c/script>",a),f=[b.slice(f,a)],a+=9):"style"==k?(f=a+1,a=b.indexOf("",a),f=[b.slice(f,a)],a+=8):-1===z.indexOf(k)?(a++,f=d(k)):a++:a++;return{tagName:k,attributes:n,children:f}}function m(){var k=(new RegExp("\\s"+c.attrName+"\\s*=['\"]"+c.attrValue+"['\"]")).exec(b);return k?k.index:-1}c=c||{};var a=c.pos||0,u=!!c.keepComments,x=!!c.keepWhitespace,y="\r\n\t>/= ", +z=c.noChildNodes||"img br input meta link hr".split(" "),r=null;if(void 0!==c.attrValue)for(c.attrName=c.attrName||"id",r=[];-1!==(a=m());)a=b.lastIndexOf("<",a),-1!==a&&r.push(g()),b=b.substr(a),a=0;else r=c.parseNode?g():d("");c.filter&&(r=v(r,c.filter));if(c.simplify)return p(Array.isArray(r)?r:[r]);c.setPos&&(r.pos=a);return r}function p(b){var c={};if(!b.length)return"";if(1===b.length&&"string"==typeof b[0])return b[0];b.forEach(function(e){if("object"===typeof e){c[e.tagName]||(c[e.tagName]= +[]);var g=p(e.children);c[e.tagName].push(g);Object.keys(e.attributes).length&&(g._attributes=e.attributes)}});for(var d in c)1==c[d].length&&(c[d]=c[d][0]);return c}function t(b,c){c=void 0===c?{}:c;var d={};if(!b.length)return d;if(1===b.length&&"string"==typeof b[0])return Object.keys(c).length?{_attributes:c,value:b[0]}:b[0];b.forEach(function(e){if("object"===typeof e){d[e.tagName]||(d[e.tagName]=[]);var 
g=t(e.children||[],e.attributes);d[e.tagName].push(g);Object.keys(e.attributes).length&& +(g._attributes=e.attributes)}});return d}function v(b,c,d,e){d=void 0===d?0:d;e=void 0===e?"":e;var g=[];b.forEach(function(m,a){"object"===typeof m&&c(m,a,d,e)&&g.push(m);if(m.children){var u=v(m.children,c,d+1,(e?e+".":"")+a+"."+m.tagName);g=g.concat(u)}});return g}function w(b){if(Array.isArray(b)){var c="";b.forEach(function(d){c+=" "+w(d);c=c.trim()});return c}return"object"===typeof b?w(b.children):" "+b}return{parse:h,simplify:p,simplifyLostLess:t,filter:v,stringify:function(b){function c(e){if(e)for(var g= +0;g":(d+=">",c(a.children),d+="")}}var d="";c(b);return d},toContentString:w,getElementById:function(b,c,d){b=h(b,{attrValue:c});return d?tXml.simplify(b):b[0]},getElementsByClassName:function(b, +c,d){b=h(b,{attrName:"class",attrValue:"[a-zA-Z0-9- ]*"+c+"[a-zA-Z0-9- ]*"});return d?tXml.simplify(b):b}}}(); \ No newline at end of file diff --git a/test.js b/test.js index b7f46cc..208928e 100644 --- a/test.js +++ b/test.js @@ -6,7 +6,8 @@ const files = { commented: __dirname + '/test/examples/commented.svg', commentOnly: __dirname + '/test/examples/commentOnly.svg', twoComments: __dirname + '/test/examples/twocomments.svg', - tagesschauRSS: '/test/examples/tagesschau.rss', + tagesschauRSS: __dirname + '/test/examples/tagesschau.rss', + wordpadDocxDocument: __dirname+'/test/examples/wordpad.docx.document.xml', }; assert(tXml, 'tXml is available'); @@ -132,7 +133,7 @@ assert.deepStrictEqual(x, xShould, 'find elements by class') // re-stringify an attribute without value var s = ""; -assert(tXml.stringify(tXml.parse(s)) === s, 'problem with attribute without value'); +assert.deepStrictEqual(tXml.stringify(tXml.parse(s)), s, 'problem with attribute without value'); assert(tXml.stringify(undefined) === '', 'stringify ignore null values'); assert(tXml.toContentString(tXml.parse('fff')) === "f f f") @@ -275,6 +276,16 @@ assert.deepStrictEqual(tXml.simplifyLostLess(['1',2]), 
{}, 'ignore non objects') assert.deepStrictEqual(tXml.filter([{}],()=>true), [{}], 'allow nodes without children') +const wordpadDoc = fs.readFileSync(files.wordpadDocxDocument).toString(); +assert.deepStrictEqual( + tXml.filter( + tXml.parse(wordpadDoc, { keepWhitespace: true }), + (n) => n.tagName === 'w:t' + )[1].children[0], + ' ' +); + + // https://github.com/TobiasNickel/tXml/issues/14 testAsync().catch(err=>console.log(err)); async function testAsync(){ diff --git a/test/examples/wordpad.docx.document.xml b/test/examples/wordpad.docx.document.xml new file mode 100644 index 0000000..9b204a7 --- /dev/null +++ b/test/examples/wordpad.docx.document.xml @@ -0,0 +1 @@ +- \ No newline at end of file