From 263a57c9382c2cd1dd5ba334a56f5b2b6ffd58bd Mon Sep 17 00:00:00 2001
From: "Sergii.Kliuchnyk" <sergii.kliuchnyk@masterofcode.com>
Date: Thu, 9 Jun 2016 00:21:58 -0700
Subject: [PATCH] added support for parsing out custom data elements

closes #9
---
 README.md                  |  42 +++++++++
 src/context.js             |  51 ++++++++---
 src/parser.js              | 183 +++++++++++++++++++++++--------------
 tests/dataelement-tests.js |  95 +++++++++++++++++++
 4 files changed, 291 insertions(+), 80 deletions(-)
 create mode 100644 tests/dataelement-tests.js
diff --git a/README.md b/README.md
index 18bc337..6e1d68c 100644
--- a/README.md
+++ b/README.md
@@ -84,6 +84,40 @@ console.log(sanitized);
 //<p>blah blah</p>
 ```
 
+### Custom data elements
+You can parse custom data elements, such as PHP code or Underscore templates, with the `regex.dataElements` config:
+```javascript
+helpers.parseString('<div><?= "<div>$var</div>" ?></div>', {
+    openElement: function(name) {
+        console.log(name); // 'div'
+    },
+    closeElement: function(name) {
+        console.log(name); // 'div'
+    },
+    phpEcho: function(value) {
+        console.log(value); // {length: 19, someProperty: ' "<div>$var</div>" '}
+    }
+}, {
+    dataElements: {
+        phpEcho: {
+            start: '<?=',
+            data: function (string) {
+                var index = string.indexOf('?>'),
+                    code = string.slice(0, index);
+
+                return code;
+                // or
+                return {
+                    length: code.length, // required field
+                    someProperty: code
+                };
+            },
+            end: '?>'
+        }
+    }
+});
+```
+
 ## API
 ```javascript
 /**
@@ -105,9 +139,17 @@ console.log(sanitized);
  * @param {Object} [regex]
  * @param {RegExp} [regex.name] Regex for element name. Default is [a-zA-Z_][\w:\-\.]*
  * @param {RegExp} [regex.attribute] Regex for attribute name. Default is [a-zA-Z_][\w:\-\.]*
+ * @param {Object.<string,DataElementConfig>} [regex.dataElements] Config of data elements, keyed by callback name, like docType, comment and your own custom data elements
  */
 parse(htmlString, callbacks, regex)
 
+/**
+ * @typedef {Object} DataElementConfig
+ * @property {String|RegExp|Function} start - start of data element, for example '<%' or /^<\?=/ or function(string){return string.slice(0, 2) === '<%' ? 2 : -1;}
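+ * @property {RegExp|Function} [data] - content of data element, for example /^[^\s]+/ or function(string){return string.match(/^[^\s]+/)[0];}. Optional: when omitted, the content is everything up to the first match of `end` (or the rest of the input when no match is found)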
+ * @property {String|RegExp|Function} end - end of data element, for example '%>' or /^\?>/ or function(string){return 2;}
+ */
+
 /**
  * Parses the HTML contained in the given file asynchronously.
  *
diff --git a/src/context.js b/src/context.js
index b9c27c2..4ad2e6b 100644
--- a/src/context.js
+++ b/src/context.js
@@ -1,5 +1,6 @@
-exports.create = function(raw, options, regex) {
+exports.create = function(raw, callbacks, regex) {
 	var index = 0,
+		current = null,
 		substring = null;
 
 	var context = {
@@ -58,7 +59,7 @@ exports.create = function(raw, options, regex) {
 	};
 
 	context.__defineGetter__('current', function() {
-		return this.isEof() ? '' : this.raw.charAt(this.index);
+		return this.isEof() ? '' : current === null ? (current = this.raw.charAt(this.index)) : current;
 	});
 	context.__defineGetter__('raw', function() {
 		return raw;
@@ -71,6 +72,7 @@ exports.create = function(raw, options, regex) {
 	});
 	context.__defineSetter__('index', function(value) {
 		index = value;
+		current = null;
 		substring = null;
 	});
 	context.__defineGetter__('substring', function() {
@@ -80,20 +82,45 @@ exports.create = function(raw, options, regex) {
 	context.callbacks = {};
 	var types = [ 'openElement', 'closeElement', 'attribute', 'comment', 'cdata', 'text', 'docType', 'xmlProlog', 'closeOpenedElement' ];
 	types.forEach(function(value) {
-		context.callbacks[value] = options[value] || function() {
-		};
+		context.callbacks[value] = function() {};
 	});
 
+	merge(context.callbacks, callbacks || {});
+
 	context.regex = {
 		name: /[a-zA-Z_][\w:\-\.]*/,
-		attribute: /[a-zA-Z_][\w:\-\.]*/
-	};
-	regex = regex || {};
-	for (var name in regex) {
-		if (regex.hasOwnProperty(name)) {
-			context.regex[name] = regex[name];
+		attribute: /[a-zA-Z_][\w:\-\.]*/,
+		dataElements: {
+			cdata: {
+				start: '<![CDATA[',
+				end: ']]>'
+			},
+			comment: {
+				start: '<!--',
+				end: '-->'
+			},
+			docType: {
+				start: /^<!DOCTYPE /i,
+				end: '>'
+			}
 		}
-	}
+	};
+
+	merge(context.regex, regex || {});
 
 	return context;
-};
\ No newline at end of file
+};
+
+function merge(target, source) {
+    for (var name in source) {
+		if (!source.hasOwnProperty(name)) continue;
+
+		var value = source[name];
+
+		if (target[name] && typeof value === 'object' && value instanceof RegExp === false) {
+			merge(target[name], value);
+		} else {
+			target[name] = value;
+		}
+	}
+}
\ No newline at end of file
diff --git a/src/parser.js b/src/parser.js
index 4c4a620..dc81184 100644
--- a/src/parser.js
+++ b/src/parser.js
@@ -94,34 +94,72 @@ function parseEndElement(context) {
 	context.readRegex(/.*?(?:>|$)/);
 }
 
-function parseCData(context) {
-	//read "![CDATA["
-	context.read(8);
+function parseDataElement(context, dataElement) {
+	var start = dataElement.start,
+		data = dataElement.data,
+		end = dataElement.end;
+
+	switch (typeof start) {
+		case 'string':
+			start = start.length;
+			break;
+		case 'object':
+			start = start.exec(context.substring);
+			start = start[start.length - 1].length;
+			break;
+		case 'function':
+			start = start(context.substring);
+			break;
+	}
 
-	var match = /^([\s\S]*?)(?:$|]]>)/.exec(context.substring);
-	var value = match[1];
-	context.read(match[0].length);
-	context.callbacks.cdata(value);
-}
+	context.read(start);
+
+	switch (typeof data) {
+		case 'object':
+			data = data.exec(context.substring);
+			data = data[data.length - 1];
+			break;
+		case 'function':
+			data = data(context.substring);
+			break;
+		case 'undefined':
+			var index = -1;
+
+			switch (typeof end) {
+				case 'string':
+					index = context.substring.indexOf(end);
+					break;
+				case 'object':
+					var match = context.substring.match(end);
+					if (match) {
+						match = match[match.length - 1];
+						index = context.substring.indexOf(match);
+					}
+					break;
+			}
 
-function parseComment(context) {
-	//read "!--"
-	context.read(3);
+			data = index > -1 ? context.substring.slice(0, index) : context.substring;
+			break;
+	}
 
-	var match = /^([\s\S]*?)(?:$|-->)/.exec(context.substring);
-	var value = match[1];
-	context.read(match[0].length);
-	context.callbacks.comment(value);
-}
+	context.read(data.length);
+
+	switch (typeof end) {
+		case 'string':
+			end = end.length;
+			break;
+		case 'object':
+			end = end.exec(context.substring);
+			end = end[end.length - 1].length;
+			break;
+		case 'function':
+			end = end(context.substring);
+			break;
+	}
 
-function parseDocType(context) {
-	//read "!doctype"
-	context.read(8);
+	context.read(end);
 
-	var match = /^\s*([\s\S]*?)(?:$|>)/.exec(context.substring);
-	var value = match[1];
-	context.read(match[0].length);
-	context.callbacks.docType(value);
+	return data;
 }
 
 function parseXmlProlog(context) {
@@ -144,55 +182,56 @@ function callbackText(context) {
 }
 
 function parseNext(context) {
-	var current = context.current, buffer = current;
-	if (current == '<') {
-		buffer += context.read();
-		if (context.current === '/') {
-			buffer += context.read();
-			if (context.regex.name.test(context.current)) {
-				callbackText(context);
-				parseEndElement(context);
-			} else {
-				//malformed html
-				context.read();
-				appendText(buffer, context);
-			}
-		} else if (context.current === '!') {
-			if (/^!\[CDATA\[/.test(context.substring)) {
-				callbackText(context);
-				parseCData(context);
-			} else if (/^!--/.test(context.substring)) {
-				callbackText(context);
-				parseComment(context);
-			} else if (/^!doctype/i.test(context.substring)) {
-				callbackText(context);
-				parseDocType(context);
-			} else {
-				//malformed html
-				context.read();
-				appendText(buffer, context);
-			}
-		} else if (context.current === '?') {
-			if (/^\?xml/.test(context.substring)) {
-				callbackText(context);
-				parseXmlProlog(context);
-			} else {
-				//malformed xml prolog
-				context.read();
-				appendText(buffer, context);
-			}
-		} else if (context.regex.name.test(context.current)) {
+	if (context.current === '<') {
+		var next = context.substring.charAt(1);
+		if (next === '/' && context.regex.name.test(context.substring.charAt(2))) {
+			context.read(2);
+			callbackText(context);
+			parseEndElement(context);
+			return;
+		} else if (next === '?' && /^<\?xml/.test(context.substring)) {
+			context.read(1);
+			callbackText(context);
+			parseXmlProlog(context);
+			return;
+		} else if (context.regex.name.test(next)) {
+			context.read(1);
 			callbackText(context);
 			parseOpenElement(context);
-		} else {
-			//malformed html
-			context.read();
-			appendText(buffer, context);
+			return;
+		}
+	}
+
+	for (var callbackName in context.regex.dataElements) {
+		if (!context.regex.dataElements.hasOwnProperty(callbackName)) {
+			continue;
+		}
+
+		var dataElement = context.regex.dataElements[callbackName],
+			start = dataElement.start,
+			isValid = false;
+
+		switch (typeof start) {
+			case 'string':
+				isValid = context.substring.slice(0, start.length) === start;
+				break;
+			case 'object':
+				isValid = start.test(context.substring);
+				break;
+			case 'function':
+				isValid = start(context.substring) > -1;
+				break;
+		}
+
+		if (isValid) {
+			callbackText(context);
+			context.callbacks[callbackName](parseDataElement(context, dataElement));
+			return;
 		}
-	} else {
-		appendText(context.current, context);
-		context.read();
 	}
+
+	appendText(context.current, context);
+	context.read();
 }
 
 /**
@@ -215,6 +254,7 @@ function parseNext(context) {
  * @param {Object} [regex]
  * @param {RegExp} [regex.name] Regex for element name. Default is [a-zA-Z_][\w:\-\.]*
  * @param {RegExp} [regex.attribute] Regex for attribute name. Default is [a-zA-Z_][\w:\-\.]*
+ * @param {Object.<string,DataElementConfig>} [regex.dataElements] Config of data elements like docType, comment and your own custom data elements
  */
 exports.parse = function(htmlString, callbacks, regex) {
 	htmlString = htmlString.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
@@ -226,6 +266,13 @@ exports.parse = function(htmlString, callbacks, regex) {
 	callbackText(context);
 };
 
+/**
+ * @typedef {Object} DataElementConfig
+ * @property {String|RegExp|Function} start - start of data element, for example '<%' or /^<\?=/ or function(string){return string.slice(0, 2) === '<%' ? 2 : -1;}
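+ * @property {RegExp|Function} [data] - content of data element, for example /^[^\s]+/ or function(string){return string.match(/^[^\s]+/)[0];}. Optional: when omitted, the content is everything up to the first match of `end` (or the rest of the input when no match is found)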
+ * @property {String|RegExp|Function} end - end of data element, for example '%>' or /^\?>/ or function(string){return 2;}
+ */
+
 /**
  * Parses the HTML contained in the given file asynchronously.
  *
diff --git a/tests/dataelement-tests.js b/tests/dataelement-tests.js
new file mode 100644
index 0000000..6de9bca
--- /dev/null
+++ b/tests/dataelement-tests.js
@@ -0,0 +1,95 @@
+var should = require('should');
+var helpers = require('./helpers');
+
+describe('dataElement', function () {
+    it('as string', function() {
+        var dataCount = 0;
+        helpers.parseString('<?php echo "<html></html>" ?>', {
+            php: function(value) {
+                value.should.equal(' echo "<html></html>" ');
+                dataCount++;
+            }
+        }, {
+            dataElements: {
+                php: {
+                    start: '<?php',
+                    end: '?>'
+                }
+            }
+        });
+
+        dataCount.should.equal(1);
+    });
+
+    it('as regex', function() {
+        var dataCount = 0, openCount = 0, closeCount = 0;
+        helpers.parseString('<foo><?= "<div>$var</div>" ?></foo>', {
+            openElement: function(name) {
+                name.should.equal('foo');
+                openCount++;
+            },
+            closeElement: function(name) {
+                name.should.equal('foo');
+                closeCount++;
+            },
+            phpEcho: function(value) {
+                openCount.should.equal(1);
+                closeCount.should.equal(0);
+                value.should.equal(' "<div>$var</div>" ');
+                dataCount++;
+            }
+        }, {
+            dataElements: {
+                phpEcho: {
+                    start: /^<\?=/,
+                    end: /\?>/
+                }
+            }
+        });
+
+        dataCount.should.equal(1);
+        openCount.should.equal(1);
+        closeCount.should.equal(1);
+    });
+
+    it('as function', function() {
+        var dataCount = 0;
+        helpers.parseString('<!-- test --><?php\nfoo\n?>', {
+            comment: function (value) {
+                value.should.equal(' test ');
+                dataCount++;
+            },
+            php: function(value) {
+                value.should.deepEqual({
+                    value: '\nfoo\n',
+                    length: '\nfoo\n'.length
+                });
+                dataCount++;
+            }
+        }, {
+            dataElements: {
+                php: {
+                    start: function (substring) {
+                        return substring.slice(0, 5) === '<?php' ? 5 : -1;
+                    },
+                    data: function (substring) {
+                        substring.should.equal('\nfoo\n?>');
+                        dataCount++;
+                        var index = substring.indexOf('?>');
+                        return {
+                            value: substring.slice(0, index),
+                            length: substring.slice(0, index).length
+                        };
+                    },
+                    end: function (substring) {
+                        substring.should.equal('?>');
+                        dataCount++;
+                        return 2;
+                    }
+                }
+            }
+        });
+
+        dataCount.should.equal(4);
+    });
+});
\ No newline at end of file