diff --git a/README.md b/README.md
index 18bc337..6e1d68c 100644
--- a/README.md
+++ b/README.md
@@ -84,6 +84,40 @@ console.log(sanitized);
blah blah
+### Custom data elements
+You can parse custom data elements, such as PHP code or Underscore templates, with the `regex.dataElements` config:
+helpers.parseString('', {
+ openElement: function(name) {
+ console.log(name); // 'div'
+ },
+ closeElement: function(name) {
+ console.log(name); // 'div'
+ },
+ phpEcho: function(value) {
+ console.log(value); // {length: 61, someProperty: ' "$var
" '}
+ }
+}, {
+ dataElements: {
+ phpEcho: {
+ start: '=',
+ data: function (string) {
+ var index = string.indexOf('?>'),
+ code = string.slice(0, index);
+ return code;
+ // or
+ return {
+ length: code.length, // required field
+ someProperty: code
+ };
+ },
+ end: '?>'
+ }
+ }
## API
@@ -105,9 +139,17 @@ console.log(sanitized);
* @param {Object} [regex]
* @param {RegExp} [regex.name] Regex for element name. Default is [a-zA-Z_][\w:\-\.]*
* @param {RegExp} [regex.attribute] Regex for attribute name. Default is [a-zA-Z_][\w:\-\.]*
+ * @param {Object.<string, DataElementConfig>} [regex.dataElements] Config of data elements like docType, comment, and your own custom data elements
parse(htmlString, callbacks, regex)
+ * @typedef {Object} DataElementConfig
+ * @property {String|RegExp|Function} start - start of data element, for example '<%' or /^<\?=/ or function(string){return string.slice(0, 2) === '<%' ? 2 : -1;}
+ * @property {RegExp|Function} data - content of data element, for example /^[^\s]+/ or function(string){return string.match(/^[^\s]+/)[0];}
+ * @property {String|RegExp|Function} end - end of data element, for example '%>' or /^\?>/ or function(string){return 2;}
+ */
* Parses the HTML contained in the given file asynchronously.
diff --git a/src/context.js b/src/context.js
index b9c27c2..4ad2e6b 100644
--- a/src/context.js
+++ b/src/context.js
@@ -1,5 +1,6 @@
-exports.create = function(raw, options, regex) {
+exports.create = function(raw, callbacks, regex) {
var index = 0,
+ current = null,
substring = null;
var context = {
@@ -58,7 +59,7 @@ exports.create = function(raw, options, regex) {
context.__defineGetter__('current', function() {
- return this.isEof() ? '' : this.raw.charAt(this.index);
+ return this.isEof() ? '' : current === null ? (current = this.raw.charAt(this.index)) : current;
context.__defineGetter__('raw', function() {
return raw;
@@ -71,6 +72,7 @@ exports.create = function(raw, options, regex) {
context.__defineSetter__('index', function(value) {
index = value;
+ current = null;
substring = null;
context.__defineGetter__('substring', function() {
@@ -80,20 +82,45 @@ exports.create = function(raw, options, regex) {
context.callbacks = {};
var types = [ 'openElement', 'closeElement', 'attribute', 'comment', 'cdata', 'text', 'docType', 'xmlProlog', 'closeOpenedElement' ];
types.forEach(function(value) {
- context.callbacks[value] = options[value] || function() {
- };
+ context.callbacks[value] = function() {};
+ merge(context.callbacks, callbacks || {});
context.regex = {
name: /[a-zA-Z_][\w:\-\.]*/,
- attribute: /[a-zA-Z_][\w:\-\.]*/
- };
- regex = regex || {};
- for (var name in regex) {
- if (regex.hasOwnProperty(name)) {
- context.regex[name] = regex[name];
+ attribute: /[a-zA-Z_][\w:\-\.]*/,
+ dataElements: {
+ cdata: {
+ start: ''
+ },
+ comment: {
+ start: ''
+ },
+ docType: {
+ start: /^'
+ }
- }
+ };
+ merge(context.regex, regex || {});
return context;
\ No newline at end of file
+function merge(target, source) {
+ for (var name in source) {
+ if (!source.hasOwnProperty(name)) continue;
+ var value = source[name];
+ if (target[name] && typeof value === 'object' && value instanceof RegExp === false) {
+ merge(target[name], value);
+ } else {
+ target[name] = value;
+ }
+ }
\ No newline at end of file
diff --git a/src/parser.js b/src/parser.js
index 4c4a620..dc81184 100644
--- a/src/parser.js
+++ b/src/parser.js
@@ -94,34 +94,72 @@ function parseEndElement(context) {
-function parseCData(context) {
- //read "![CDATA["
- context.read(8);
+function parseDataElement(context, dataElement) {
+ var start = dataElement.start,
+ data = dataElement.data,
+ end = dataElement.end;
+ switch (typeof start) {
+ case 'string':
+ start = start.length;
+ break;
+ case 'object':
+ start = start.exec(context.substring);
+ start = start[start.length - 1].length;
+ break;
+ case 'function':
+ start = start(context.substring);
+ break;
+ }
- var match = /^([\s\S]*?)(?:$|]]>)/.exec(context.substring);
- var value = match[1];
- context.read(match[0].length);
- context.callbacks.cdata(value);
+ context.read(start);
+ switch (typeof data) {
+ case 'object':
+ data = data.exec(context.substring);
+ data = data[data.length - 1];
+ break;
+ case 'function':
+ data = data(context.substring);
+ break;
+ case 'undefined':
+ var index = -1;
+ switch (typeof end) {
+ case 'string':
+ index = context.substring.indexOf(end);
+ break;
+ case 'object':
+ var match = context.substring.match(end);
+ if (match) {
+ match = match[match.length - 1];
+ index = context.substring.indexOf(match);
+ }
+ break;
+ }
-function parseComment(context) {
- //read "!--"
- context.read(3);
+ data = index > -1 ? context.substring.slice(0, index) : context.substring;
+ break;
+ }
- var match = /^([\s\S]*?)(?:$|-->)/.exec(context.substring);
- var value = match[1];
- context.read(match[0].length);
- context.callbacks.comment(value);
+ context.read(data.length);
+ switch (typeof end) {
+ case 'string':
+ end = end.length;
+ break;
+ case 'object':
+ end = end.exec(context.substring);
+ end = end[end.length - 1].length;
+ break;
+ case 'function':
+ end = end(context.substring);
+ break;
+ }
-function parseDocType(context) {
- //read "!doctype"
- context.read(8);
+ context.read(end);
- var match = /^\s*([\s\S]*?)(?:$|>)/.exec(context.substring);
- var value = match[1];
- context.read(match[0].length);
- context.callbacks.docType(value);
+ return data;
function parseXmlProlog(context) {
@@ -144,55 +182,56 @@ function callbackText(context) {
function parseNext(context) {
- var current = context.current, buffer = current;
- if (current == '<') {
- buffer += context.read();
- if (context.current === '/') {
- buffer += context.read();
- if (context.regex.name.test(context.current)) {
- callbackText(context);
- parseEndElement(context);
- } else {
- //malformed html
- context.read();
- appendText(buffer, context);
- }
- } else if (context.current === '!') {
- if (/^!\[CDATA\[/.test(context.substring)) {
- callbackText(context);
- parseCData(context);
- } else if (/^!--/.test(context.substring)) {
- callbackText(context);
- parseComment(context);
- } else if (/^!doctype/i.test(context.substring)) {
- callbackText(context);
- parseDocType(context);
- } else {
- //malformed html
- context.read();
- appendText(buffer, context);
- }
- } else if (context.current === '?') {
- if (/^\?xml/.test(context.substring)) {
- callbackText(context);
- parseXmlProlog(context);
- } else {
- //malformed xml prolog
- context.read();
- appendText(buffer, context);
- }
- } else if (context.regex.name.test(context.current)) {
+ if (context.current === '<') {
+ var next = context.substring.charAt(1);
+ if (next === '/' && context.regex.name.test(context.substring.charAt(2))) {
+ context.read(2);
+ callbackText(context);
+ parseEndElement(context);
+ return;
+ } else if (next === '?' && /^<\?xml/.test(context.substring)) {
+ context.read(1);
+ callbackText(context);
+ parseXmlProlog(context);
+ return;
+ } else if (context.regex.name.test(next)) {
+ context.read(1);
- } else {
- //malformed html
- context.read();
- appendText(buffer, context);
+ return;
+ }
+ }
+ for (var callbackName in context.regex.dataElements) {
+ if (!context.regex.dataElements.hasOwnProperty(callbackName)) {
+ continue;
+ }
+ var dataElement = context.regex.dataElements[callbackName],
+ start = dataElement.start,
+ isValid = false;
+ switch (typeof start) {
+ case 'string':
+ isValid = context.substring.slice(0, start.length) === start;
+ break;
+ case 'object':
+ isValid = start.test(context.substring);
+ break;
+ case 'function':
+ isValid = start(context.substring) > -1;
+ break;
+ }
+ if (isValid) {
+ callbackText(context);
+ context.callbacks[callbackName](parseDataElement(context, dataElement));
+ return;
- } else {
- appendText(context.current, context);
- context.read();
+ appendText(context.current, context);
+ context.read();
@@ -215,6 +254,7 @@ function parseNext(context) {
* @param {Object} [regex]
* @param {RegExp} [regex.name] Regex for element name. Default is [a-zA-Z_][\w:\-\.]*
* @param {RegExp} [regex.attribute] Regex for attribute name. Default is [a-zA-Z_][\w:\-\.]*
+ * @param {Object.<string, DataElementConfig>} [regex.dataElements] Config of data elements like docType, comment, and your own custom data elements
exports.parse = function(htmlString, callbacks, regex) {
htmlString = htmlString.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
@@ -226,6 +266,13 @@ exports.parse = function(htmlString, callbacks, regex) {
+ * @typedef {Object} DataElementConfig
+ * @property {String|RegExp|Function} start - start of data element, for example '<%' or /^<\?=/ or function(string){return string.slice(0, 2) === '<%' ? 2 : -1;}
+ * @property {RegExp|Function} data - content of data element, for example /^[^\s]+/ or function(string){return string.match(/^[^\s]+/)[0];}
+ * @property {String|RegExp|Function} end - end of data element, for example '%>' or /^\?>/ or function(string){return 2;}
+ */
* Parses the HTML contained in the given file asynchronously.
diff --git a/tests/dataelement-tests.js b/tests/dataelement-tests.js
new file mode 100644
index 0000000..6de9bca
--- /dev/null
+++ b/tests/dataelement-tests.js
@@ -0,0 +1,95 @@
+var should = require('should');
+var helpers = require('./helpers');
+describe('dataElement', function () {
+ it('as string', function() {
+ var dataCount = 0;
+ helpers.parseString('