Skip to content

Commit

Permalink
added support for parsing out custom data elements
Browse files Browse the repository at this point in the history
closes #9
  • Loading branch information
Sergii.Kliuchnyk authored and tmont committed Jun 9, 2016
1 parent 833ac01 commit 263a57c
Show file tree
Hide file tree
Showing 4 changed files with 291 additions and 80 deletions.
42 changes: 42 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,40 @@ console.log(sanitized);
//<p>blah blah</p>
```

### Custom data elements
You can parse custom data elements, such as PHP code or Underscore templates, with the `regex.dataElements` config:
```javascript
helpers.parseString('<div><?= "<div>$var</div>" ?></div>', {
openElement: function(name) {
console.log(name); // 'div'
},
closeElement: function(name) {
console.log(name); // 'div'
},
phpEcho: function(value) {
console.log(value); // {length: 61, someProperty: ' "<div>$var</div>" '}
}
}, {
dataElements: {
phpEcho: {
start: '<?=',
data: function (string) {
var index = string.indexOf('?>'),
code = string.slice(0, index);

return code;
// or
return {
length: code.length, // required field
someProperty: code
};
},
end: '?>'
}
}
});
```

## API
```javascript
/**
Expand All @@ -105,9 +139,17 @@ console.log(sanitized);
* @param {Object} [regex]
* @param {RegExp} [regex.name] Regex for element name. Default is [a-zA-Z_][\w:\-\.]*
* @param {RegExp} [regex.attribute] Regex for attribute name. Default is [a-zA-Z_][\w:\-\.]*
* @param {Object.<callbackName,DataElementConfig>} [regex.dataElements] Config of data elements like docType, comment and your own custom data elements
*/
parse(htmlString, callbacks, regex)

/**
* @typedef {Object} DataElementConfig
* @property {String|RegExp|Function} start - start of data element, for example '<%' or /^<\?=/ or function(string){return string.slice(0, 2) === '<%' ? 2 : -1;}
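* @property {RegExp|Function} [data] - content of data element, for example /^[^\s]+/ or function(string){return string.match(/^[^\s]+/)[0];}. Optional: when omitted, the content runs up to the first occurrence of a String/RegExp `end`, or to the end of the input if `end` does not occur.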
* @property {String|RegExp|Function} end - end of data element, for example '%>' or /^\?>/ or function(string){return 2;}
*/

/**
* Parses the HTML contained in the given file asynchronously.
*
Expand Down
51 changes: 39 additions & 12 deletions src/context.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
exports.create = function(raw, options, regex) {
exports.create = function(raw, callbacks, regex) {
var index = 0,
current = null,
substring = null;

var context = {
Expand Down Expand Up @@ -58,7 +59,7 @@ exports.create = function(raw, options, regex) {
};

context.__defineGetter__('current', function() {
return this.isEof() ? '' : this.raw.charAt(this.index);
return this.isEof() ? '' : current === null ? (current = this.raw.charAt(this.index)) : current;
});
context.__defineGetter__('raw', function() {
return raw;
Expand All @@ -71,6 +72,7 @@ exports.create = function(raw, options, regex) {
});
context.__defineSetter__('index', function(value) {
index = value;
current = null;
substring = null;
});
context.__defineGetter__('substring', function() {
Expand All @@ -80,20 +82,45 @@ exports.create = function(raw, options, regex) {
context.callbacks = {};
var types = [ 'openElement', 'closeElement', 'attribute', 'comment', 'cdata', 'text', 'docType', 'xmlProlog', 'closeOpenedElement' ];
types.forEach(function(value) {
context.callbacks[value] = options[value] || function() {
};
context.callbacks[value] = function() {};
});

merge(context.callbacks, callbacks || {});

context.regex = {
name: /[a-zA-Z_][\w:\-\.]*/,
attribute: /[a-zA-Z_][\w:\-\.]*/
};
regex = regex || {};
for (var name in regex) {
if (regex.hasOwnProperty(name)) {
context.regex[name] = regex[name];
attribute: /[a-zA-Z_][\w:\-\.]*/,
dataElements: {
cdata: {
start: '<![CDATA[',
end: ']]>'
},
comment: {
start: '<!--',
end: '-->'
},
docType: {
start: /^<!DOCTYPE /i,
end: '>'
}
}
}
};

merge(context.regex, regex || {});

return context;
};
};

/**
 * Recursively copies the own enumerable properties of `source` onto `target`,
 * mutating `target` in place. Used to overlay user-supplied callbacks and
 * regex/data-element config on top of the built-in defaults.
 *
 * Rules, per property:
 *  - `undefined` and `null` values are skipped so that an unset option never
 *    wipes out a default (a default callback must stay callable, a default
 *    regex must stay usable).
 *  - When both the existing value and the new value are mergeable objects
 *    (non-null objects that are not RegExp instances) they are merged
 *    recursively, so sibling defaults are preserved.
 *  - Anything else (functions, RegExps, strings, numbers, or an object that
 *    replaces a primitive default) is assigned directly.
 *  - An own `__proto__` key is ignored; recursing into it would write onto
 *    Object.prototype.
 *
 * @param {Object} target Object that receives the properties (modified in place)
 * @param {Object} source Object whose own enumerable properties are copied
 */
function merge(target, source) {
	for (var name in source) {
		// Borrow hasOwnProperty so null-prototype sources work as well.
		if (!Object.prototype.hasOwnProperty.call(source, name)) continue;

		// Guard against prototype pollution via JSON-parsed input.
		if (name === '__proto__') continue;

		var value = source[name];

		// An unset option keeps whatever default is already in place.
		if (value === undefined || value === null) continue;

		if (isMergeable(target[name]) && isMergeable(value)) {
			merge(target[name], value);
		} else {
			target[name] = value;
		}
	}
}

/**
 * Tells whether a value is a container that merge() may recurse into:
 * a non-null object that is not a RegExp. Functions and primitives are not.
 *
 * @param {*} value
 * @returns {Boolean}
 */
function isMergeable(value) {
	return value !== null && typeof value === 'object' && !(value instanceof RegExp);
}
183 changes: 115 additions & 68 deletions src/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -94,34 +94,72 @@ function parseEndElement(context) {
context.readRegex(/.*?(?:>|$)/);
}

function parseCData(context) {
//read "![CDATA["
context.read(8);
function parseDataElement(context, dataElement) {
var start = dataElement.start,
data = dataElement.data,
end = dataElement.end;

switch (typeof start) {
case 'string':
start = start.length;
break;
case 'object':
start = start.exec(context.substring);
start = start[start.length - 1].length;
break;
case 'function':
start = start(context.substring);
break;
}

var match = /^([\s\S]*?)(?:$|]]>)/.exec(context.substring);
var value = match[1];
context.read(match[0].length);
context.callbacks.cdata(value);
}
context.read(start);

switch (typeof data) {
case 'object':
data = data.exec(context.substring);
data = data[data.length - 1];
break;
case 'function':
data = data(context.substring);
break;
case 'undefined':
var index = -1;

switch (typeof end) {
case 'string':
index = context.substring.indexOf(end);
break;
case 'object':
var match = context.substring.match(end);
if (match) {
match = match[match.length - 1];
index = context.substring.indexOf(match);
}
break;
}

function parseComment(context) {
//read "!--"
context.read(3);
data = index > -1 ? context.substring.slice(0, index) : context.substring;
break;
}

var match = /^([\s\S]*?)(?:$|-->)/.exec(context.substring);
var value = match[1];
context.read(match[0].length);
context.callbacks.comment(value);
}
context.read(data.length);

switch (typeof end) {
case 'string':
end = end.length;
break;
case 'object':
end = end.exec(context.substring);
end = end[end.length - 1].length;
break;
case 'function':
end = end(context.substring);
break;
}

function parseDocType(context) {
//read "!doctype"
context.read(8);
context.read(end);

var match = /^\s*([\s\S]*?)(?:$|>)/.exec(context.substring);
var value = match[1];
context.read(match[0].length);
context.callbacks.docType(value);
return data;
}

function parseXmlProlog(context) {
Expand All @@ -144,55 +182,56 @@ function callbackText(context) {
}

function parseNext(context) {
var current = context.current, buffer = current;
if (current == '<') {
buffer += context.read();
if (context.current === '/') {
buffer += context.read();
if (context.regex.name.test(context.current)) {
callbackText(context);
parseEndElement(context);
} else {
//malformed html
context.read();
appendText(buffer, context);
}
} else if (context.current === '!') {
if (/^!\[CDATA\[/.test(context.substring)) {
callbackText(context);
parseCData(context);
} else if (/^!--/.test(context.substring)) {
callbackText(context);
parseComment(context);
} else if (/^!doctype/i.test(context.substring)) {
callbackText(context);
parseDocType(context);
} else {
//malformed html
context.read();
appendText(buffer, context);
}
} else if (context.current === '?') {
if (/^\?xml/.test(context.substring)) {
callbackText(context);
parseXmlProlog(context);
} else {
//malformed xml prolog
context.read();
appendText(buffer, context);
}
} else if (context.regex.name.test(context.current)) {
if (context.current === '<') {
var next = context.substring.charAt(1);
if (next === '/' && context.regex.name.test(context.substring.charAt(2))) {
context.read(2);
callbackText(context);
parseEndElement(context);
return;
} else if (next === '?' && /^<\?xml/.test(context.substring)) {
context.read(1);
callbackText(context);
parseXmlProlog(context);
return;
} else if (context.regex.name.test(next)) {
context.read(1);
callbackText(context);
parseOpenElement(context);
} else {
//malformed html
context.read();
appendText(buffer, context);
return;
}
}

for (var callbackName in context.regex.dataElements) {
if (!context.regex.dataElements.hasOwnProperty(callbackName)) {
continue;
}

var dataElement = context.regex.dataElements[callbackName],
start = dataElement.start,
isValid = false;

switch (typeof start) {
case 'string':
isValid = context.substring.slice(0, start.length) === start;
break;
case 'object':
isValid = start.test(context.substring);
break;
case 'function':
isValid = start(context.substring) > -1;
break;
}

if (isValid) {
callbackText(context);
context.callbacks[callbackName](parseDataElement(context, dataElement));
return;
}
} else {
appendText(context.current, context);
context.read();
}

appendText(context.current, context);
context.read();
}

/**
Expand All @@ -215,6 +254,7 @@ function parseNext(context) {
* @param {Object} [regex]
* @param {RegExp} [regex.name] Regex for element name. Default is [a-zA-Z_][\w:\-\.]*
* @param {RegExp} [regex.attribute] Regex for attribute name. Default is [a-zA-Z_][\w:\-\.]*
* @param {Object.<string,DataElementConfig>} [regex.dataElements] Config of data elements like docType, comment and your own custom data elements
*/
exports.parse = function(htmlString, callbacks, regex) {
htmlString = htmlString.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
Expand All @@ -226,6 +266,13 @@ exports.parse = function(htmlString, callbacks, regex) {
callbackText(context);
};

/**
* @typedef {Object} DataElementConfig
* @property {String|RegExp|Function} start - start of data element, for example '<%' or /^<\?=/ or function(string){return string.slice(0, 2) === '<%' ? 2 : -1;}
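* @property {RegExp|Function} [data] - content of data element, for example /^[^\s]+/ or function(string){return string.match(/^[^\s]+/)[0];}. Optional: when omitted, the content runs up to the first occurrence of a String/RegExp `end`, or to the end of the input if `end` does not occur.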
* @property {String|RegExp|Function} end - end of data element, for example '%>' or /^\?>/ or function(string){return 2;}
*/

/**
* Parses the HTML contained in the given file asynchronously.
*
Expand Down
Loading

0 comments on commit 263a57c

Please sign in to comment.