From 263a57c9382c2cd1dd5ba334a56f5b2b6ffd58bd Mon Sep 17 00:00:00 2001 From: "Sergii.Kliuchnyk" Date: Thu, 9 Jun 2016 00:21:58 -0700 Subject: [PATCH] added support for parsing out custom data elements closes #9 --- README.md | 42 +++++++++ src/context.js | 51 ++++++++--- src/parser.js | 183 +++++++++++++++++++++++-------------- tests/dataelement-tests.js | 95 +++++++++++++++++++ 4 files changed, 291 insertions(+), 80 deletions(-) create mode 100644 tests/dataelement-tests.js diff --git a/README.md b/README.md index 18bc337..6e1d68c 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,40 @@ console.log(sanitized); //

blah blah

``` +### Custom data elements +You can parse custom data elements like php code or underscore templates with `regex.dataElements` config +```javascript +helpers.parseString('
$var
" ?>', { + openElement: function(name) { + console.log(name); // 'div' + }, + closeElement: function(name) { + console.log(name); // 'div' + }, + phpEcho: function(value) { + console.log(value); // {length: 61, someProperty: ' "
$var
" '} + } +}, { + dataElements: { + phpEcho: { + start: ''), + code = string.slice(0, index); + + return code; + // or + return { + length: code.length, // required field + someProperty: code + }; + }, + end: '?>' + } + } +}); +``` + ## API ```javascript /** @@ -105,9 +139,17 @@ console.log(sanitized); * @param {Object} [regex] * @param {RegExp} [regex.name] Regex for element name. Default is [a-zA-Z_][\w:\-\.]* * @param {RegExp} [regex.attribute] Regex for attribute name. Default is [a-zA-Z_][\w:\-\.]* + * @param {Object.} [regex.dataElements] Config of data elements like docType, comment and your own custom data elements */ parse(htmlString, callbacks, regex) +/** + * @typedef {Object} DataElementConfig + * @property {String|RegExp|Function} start - start of data element, for example '<%' or /^<\?=/ or function(string){return string.slice(0, 2) === '<%' ? 2 : -1;} + * @property {RegExp|Function} data - content of data element, for example /^[^\s]+/ or function(string){return string.match(/^[^\s]+/)[0];} + * @property {String|RegExp|Function} end - end of data element, for example '%>' or /^\?>/ or function(string){return 2;} + */ + /** * Parses the HTML contained in the given file asynchronously. * diff --git a/src/context.js b/src/context.js index b9c27c2..4ad2e6b 100644 --- a/src/context.js +++ b/src/context.js @@ -1,5 +1,6 @@ -exports.create = function(raw, options, regex) { +exports.create = function(raw, callbacks, regex) { var index = 0, + current = null, substring = null; var context = { @@ -58,7 +59,7 @@ exports.create = function(raw, options, regex) { }; context.__defineGetter__('current', function() { - return this.isEof() ? '' : this.raw.charAt(this.index); + return this.isEof() ? '' : current === null ? 
(current = this.raw.charAt(this.index)) : current; }); context.__defineGetter__('raw', function() { return raw; @@ -71,6 +72,7 @@ exports.create = function(raw, options, regex) { }); context.__defineSetter__('index', function(value) { index = value; + current = null; substring = null; }); context.__defineGetter__('substring', function() { @@ -80,20 +82,45 @@ exports.create = function(raw, options, regex) { context.callbacks = {}; var types = [ 'openElement', 'closeElement', 'attribute', 'comment', 'cdata', 'text', 'docType', 'xmlProlog', 'closeOpenedElement' ]; types.forEach(function(value) { - context.callbacks[value] = options[value] || function() { - }; + context.callbacks[value] = function() {}; }); + merge(context.callbacks, callbacks || {}); + context.regex = { name: /[a-zA-Z_][\w:\-\.]*/, - attribute: /[a-zA-Z_][\w:\-\.]*/ - }; - regex = regex || {}; - for (var name in regex) { - if (regex.hasOwnProperty(name)) { - context.regex[name] = regex[name]; + attribute: /[a-zA-Z_][\w:\-\.]*/, + dataElements: { + cdata: { + start: '' + }, + comment: { + start: '' + }, + docType: { + start: /^' + } } - } + }; + + merge(context.regex, regex || {}); return context; -}; \ No newline at end of file +}; + +function merge(target, source) { + for (var name in source) { + if (!source.hasOwnProperty(name)) continue; + + var value = source[name]; + + if (target[name] && typeof value === 'object' && value instanceof RegExp === false) { + merge(target[name], value); + } else { + target[name] = value; + } + } +} \ No newline at end of file diff --git a/src/parser.js b/src/parser.js index 4c4a620..dc81184 100644 --- a/src/parser.js +++ b/src/parser.js @@ -94,34 +94,72 @@ function parseEndElement(context) { context.readRegex(/.*?(?:>|$)/); } -function parseCData(context) { - //read "![CDATA[" - context.read(8); +function parseDataElement(context, dataElement) { + var start = dataElement.start, + data = dataElement.data, + end = dataElement.end; + + switch (typeof start) { + case 
'string': + start = start.length; + break; + case 'object': + start = start.exec(context.substring); + start = start[start.length - 1].length; + break; + case 'function': + start = start(context.substring); + break; + } - var match = /^([\s\S]*?)(?:$|]]>)/.exec(context.substring); - var value = match[1]; - context.read(match[0].length); - context.callbacks.cdata(value); -} + context.read(start); + + switch (typeof data) { + case 'object': + data = data.exec(context.substring); + data = data[data.length - 1]; + break; + case 'function': + data = data(context.substring); + break; + case 'undefined': + var index = -1; + + switch (typeof end) { + case 'string': + index = context.substring.indexOf(end); + break; + case 'object': + var match = context.substring.match(end); + if (match) { + match = match[match.length - 1]; + index = context.substring.indexOf(match); + } + break; + } -function parseComment(context) { - //read "!--" - context.read(3); + data = index > -1 ? context.substring.slice(0, index) : context.substring; + break; + } - var match = /^([\s\S]*?)(?:$|-->)/.exec(context.substring); - var value = match[1]; - context.read(match[0].length); - context.callbacks.comment(value); -} + context.read(data.length); + + switch (typeof end) { + case 'string': + end = end.length; + break; + case 'object': + end = end.exec(context.substring); + end = end[end.length - 1].length; + break; + case 'function': + end = end(context.substring); + break; + } -function parseDocType(context) { - //read "!doctype" - context.read(8); + context.read(end); - var match = /^\s*([\s\S]*?)(?:$|>)/.exec(context.substring); - var value = match[1]; - context.read(match[0].length); - context.callbacks.docType(value); + return data; } function parseXmlProlog(context) { @@ -144,55 +182,56 @@ function callbackText(context) { } function parseNext(context) { - var current = context.current, buffer = current; - if (current == '<') { - buffer += context.read(); - if (context.current === '/') { - 
buffer += context.read(); - if (context.regex.name.test(context.current)) { - callbackText(context); - parseEndElement(context); - } else { - //malformed html - context.read(); - appendText(buffer, context); - } - } else if (context.current === '!') { - if (/^!\[CDATA\[/.test(context.substring)) { - callbackText(context); - parseCData(context); - } else if (/^!--/.test(context.substring)) { - callbackText(context); - parseComment(context); - } else if (/^!doctype/i.test(context.substring)) { - callbackText(context); - parseDocType(context); - } else { - //malformed html - context.read(); - appendText(buffer, context); - } - } else if (context.current === '?') { - if (/^\?xml/.test(context.substring)) { - callbackText(context); - parseXmlProlog(context); - } else { - //malformed xml prolog - context.read(); - appendText(buffer, context); - } - } else if (context.regex.name.test(context.current)) { + if (context.current === '<') { + var next = context.substring.charAt(1); + if (next === '/' && context.regex.name.test(context.substring.charAt(2))) { + context.read(2); + callbackText(context); + parseEndElement(context); + return; + } else if (next === '?' 
&& /^<\?xml/.test(context.substring)) { + context.read(1); + callbackText(context); + parseXmlProlog(context); + return; + } else if (context.regex.name.test(next)) { + context.read(1); callbackText(context); parseOpenElement(context); - } else { - //malformed html - context.read(); - appendText(buffer, context); + return; + } + } + + for (var callbackName in context.regex.dataElements) { + if (!context.regex.dataElements.hasOwnProperty(callbackName)) { + continue; + } + + var dataElement = context.regex.dataElements[callbackName], + start = dataElement.start, + isValid = false; + + switch (typeof start) { + case 'string': + isValid = context.substring.slice(0, start.length) === start; + break; + case 'object': + isValid = start.test(context.substring); + break; + case 'function': + isValid = start(context.substring) > -1; + break; + } + + if (isValid) { + callbackText(context); + context.callbacks[callbackName](parseDataElement(context, dataElement)); + return; } - } else { - appendText(context.current, context); - context.read(); } + + appendText(context.current, context); + context.read(); } /** @@ -215,6 +254,7 @@ function parseNext(context) { * @param {Object} [regex] * @param {RegExp} [regex.name] Regex for element name. Default is [a-zA-Z_][\w:\-\.]* * @param {RegExp} [regex.attribute] Regex for attribute name. Default is [a-zA-Z_][\w:\-\.]* + * @param {Object.} [regex.dataElements] Config of data elements like docType, comment and your own custom data elements */ exports.parse = function(htmlString, callbacks, regex) { htmlString = htmlString.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); @@ -226,6 +266,13 @@ exports.parse = function(htmlString, callbacks, regex) { callbackText(context); }; +/** + * @typedef {Object} DataElementConfig + * @property {String|RegExp|Function} start - start of data element, for example '<%' or /^<\?=/ or function(string){return string.slice(0, 2) === '<%' ? 
2 : -1;} + * @property {RegExp|Function} data - content of data element, for example /^[^\s]+/ or function(string){return string.match(/^[^\s]+/)[0];} + * @property {String|RegExp|Function} end - end of data element, for example '%>' or /^\?>/ or function(string){return 2;} + */ + /** * Parses the HTML contained in the given file asynchronously. * diff --git a/tests/dataelement-tests.js b/tests/dataelement-tests.js new file mode 100644 index 0000000..6de9bca --- /dev/null +++ b/tests/dataelement-tests.js @@ -0,0 +1,95 @@ +var should = require('should'); +var helpers = require('./helpers'); + +describe('dataElement', function () { + it('as string', function() { + var dataCount = 0; + helpers.parseString('" ?>', { + php: function(value) { + value.should.equal(' echo "" '); + dataCount++; + } + }, { + dataElements: { + php: { + start: '' + } + } + }); + + dataCount.should.equal(1); + }); + + it('as regex', function() { + var dataCount = 0, openCount = 0, closeCount = 0; + helpers.parseString('$var" ?>', { + openElement: function(name) { + name.should.equal('foo'); + openCount++; + }, + closeElement: function(name) { + name.should.equal('foo'); + closeCount++; + }, + phpEcho: function(value) { + openCount.should.equal(1); + closeCount.should.equal(0); + value.should.equal(' "
$var
" '); + dataCount++; + } + }, { + dataElements: { + phpEcho: { + start: /^<\?=/, + end: /\?>/ + } + } + }); + + dataCount.should.equal(1); + openCount.should.equal(1); + closeCount.should.equal(1); + }); + + it('as function', function() { + var dataCount = 0; + helpers.parseString('', { + comment: function (value) { + value.should.equal(' test '); + dataCount++; + }, + php: function(value) { + value.should.deepEqual({ + value: '\nfoo\n', + length: '\nfoo\n'.length + }); + dataCount++; + } + }, { + dataElements: { + php: { + start: function (substring) { + return substring.slice(0, 5) === ''); + dataCount++; + var index = substring.indexOf('?>'); + return { + value: substring.slice(0, index), + length: substring.slice(0, index).length + }; + }, + end: function (substring) { + substring.should.equal('?>'); + dataCount++; + return 2; + } + } + } + }); + + dataCount.should.equal(4); + }); +}); \ No newline at end of file