Benchmarks and optimized occurrences.

unfoldingWord · Sep 16, 2019 · 4df0ee5 · 4df0ee5
1 parent 9cc1591
commit 4df0ee5
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 50 deletions.
diff --git a/src/docs/Benchmarks.md b/src/docs/Benchmarks.md
@@ -0,0 +1,51 @@
+
+```js
+import {tokenize, word, number, punctuation, whitespace} from '../tokenizers.js';
+
+const text = `It's said that th\u200Dere are 1,000.00 different ways,\nto say...\t"I—Love—You."`;
+
+var iterations = 10000; // number of tokenize + JSON.stringify runs timed per option set
+let start, end;
+
+start = performance.now(); // run 1: only `text` is supplied, all other options left at their defaults
+for(var i = 0; i < iterations; i++ ){
+  const options = {text};
+  const tokens = tokenize(options);
+  const output = JSON.stringify(tokens, null, 2); // serialization is part of the timed work; `output` is not read afterwards
+};
+end = performance.now();
+const defaultOptions = end - start;
+
+start = performance.now(); // run 2: same text with `greedy: true`
+for(var i = 0; i < iterations; i++ ){
+  const options = {
+    text,
+    greedy: true,
+  };
+  const tokens = tokenize(options);
+  const output = JSON.stringify(tokens, null, 2);
+};
+end = performance.now();
+const greedy = end - start;
+
+start = performance.now(); // run 3: same text with `verbose: true` and `occurrences: true`
+for(var i = 0; i < iterations; i++ ){
+  const options = {
+    text,
+    verbose: true,
+    occurrences: true,
+  };
+  const tokens = tokenize(options);
+  const output = JSON.stringify(tokens, null, 2);
+};
+end = performance.now();
+const occurrences = end - start;
+
+// Show each elapsed time, rounded to whole milliseconds, wrapped in a React fragment for rendering:
+<>
+  <h4>Each run {iterations} times.</h4>
+  <p><strong>default:</strong> {defaultOptions.toFixed(0)}ms</p>
+  <p><strong>greedy:</strong> {greedy.toFixed(0)}ms</p>
+  <p><strong>occurrences:</strong> {occurrences.toFixed(0)}ms</p>
+</>
+```
diff --git a/src/occurrences.js b/src/occurrences.js
@@ -1,66 +1,69 @@
-import * as tokenizers from './tokenizers';
+import {tokenize} from './tokenizers';
 
 /**
  * Gets the occurrence of a subString in a string by using the subString index in the string.
- * @param {String} text
+ * @param {Array} tokens - Token objects; each is compared by its `token` property
  * @param {Number} currentWordIndex
  * @param {String} subString
- * @param {Object} options - The options for the tokenizer
  * @return {Object}
  */
-export const occurrenceInString = (
-  text,
+export const occurrenceInTokens = (
+  tokens,
   currentWordIndex,
   subString,
-  options={
-    includeWords: true,
-    includeNumbers: true,
-  },
 ) => {
-  const _options = {
-    text,
-    includeWords: options.includeWords,
-    includeNumbers: options.includeNumbers,
-    includePunctuation: options.includePunctuation,
-    includeWhitespace: options.includeWhitespace,
-    greedy: options.greedy,
-    parsers: options.parsers,
-  };
   let occurrence = 0;
-  const tokens = tokenizers.tokenize(_options);
   for (let i = 0; i <= currentWordIndex; i++) {
-    if (tokens[i] === subString) occurrence ++;
+    if (tokens[i].token === subString) occurrence ++; // NOTE(review): throws a TypeError when currentWordIndex >= tokens.length (tokens[i] is undefined); the removed line compared tokens[i] itself and did not throw
   }
   return occurrence;
 };
+
 /**
  * Function that count occurrences of a substring in a string
- * @param {String} text - The string to search in
+ * @param {Array} tokens - The token objects to search in (matched on each object's `token` property)
  * @param {String} subString - The sub string to search for
- * @param {Object} options - The options for the tokenizer
  * @return {Integer} - the count of the occurrences
  */
-export const occurrencesInString = (
-  text,
+export const occurrencesInTokens = (
+  tokens,
   subString,
-  options={
-    includeWords: true,
-    includeNumbers: true,
-  },
 ) => {
-  const _options = {
-    text,
-    includeWords: options.includeWords,
-    includeNumbers: options.includeNumbers,
-    includePunctuation: options.includePunctuation,
-    includeWhitespace: options.includeWhitespace,
-    greedy: options.greedy,
-    parsers: options.parsers,
-  };
   let occurrences = 0;
-  const tokens = tokenizers.tokenize(_options);
   tokens.forEach((token) => {
-    if (token === subString) occurrences ++;
+    if (token.token === subString) occurrences ++; // strict equality: only a token whose full text equals subString is counted
   });
   return occurrences;
 };
+
+/**
+ * Tokenizes text and counts the tokens equal to subString from index 0 through currentWordIndex (inclusive).
+ * @param {String} text - The string to tokenize and search in
+ * @param {Number} currentWordIndex - Index of the last token to include in the count
+ * @param {String} subString - The token text to match exactly
+ * @return {Number} - the count of matching tokens up to and including currentWordIndex
+ */
+export const occurrenceInString = (
+  text,
+  currentWordIndex,
+  subString,
+) => {
+  const tokens = tokenize({text, verbose: true}); // NOTE(review): presumably verbose makes tokenize return objects with a `token` field, which occurrenceInTokens reads — confirm in tokenizers.js
+  const occurrence = occurrenceInTokens(tokens, currentWordIndex, subString);
+  return occurrence;
+};
+
+/**
+ * Tokenizes text and counts how many of the resulting tokens are exactly equal to subString.
+ * @param {String} text - The string to tokenize and search in
+ * @param {String} subString - The token text to search for
+ * @return {Number} - the count of matching tokens in the whole text
+ */
+export const occurrencesInString = (
+  text,
+  subString,
+) => {
+  const tokens = tokenize({text, verbose: true}); // NOTE(review): presumably verbose makes tokenize return objects with a `token` field, which occurrencesInTokens reads — confirm in tokenizers.js
+  const occurrences = occurrencesInTokens(tokens, subString);
+  return occurrences;
+};
diff --git a/src/tokenizers.js b/src/tokenizers.js
@@ -1,5 +1,5 @@
 import xRegExp from 'xregexp';
-import {occurrenceInString, occurrencesInString} from './occurrences';
+import {occurrenceInTokens, occurrencesInTokens} from './occurrences';
 // constants
 export const word = xRegExp('[\\pL\\pM\\u200D\\u2060]+', '');
 export const greedyWord = xRegExp('([\\pL\\pM\\u200D\\u2060]+([-\'’]?[\\pL\\pM\\u200D\\u2060])+|[\\pL\\pM\\u200D\\u2060]+)', '');
@@ -36,16 +36,8 @@ export const tokenize = ({
   tokens = tokens.filter((token) => types.includes(token.type));
   if (occurrences) {
     tokens = tokens.map((token, index) => {
-      const options = {
-        includeWords,
-        includeNumbers,
-        includePunctuation,
-        includeWhitespace,
-        greedy,
-        parsers,
-      };
-      const _occurrences = occurrencesInString(text, token.token, options);
-      const _occurrence = occurrenceInString(text, index, token.token, options);
+      const _occurrences = occurrencesInTokens(tokens, token.token);
+      const _occurrence = occurrenceInTokens(tokens, index, token.token);
       return {...token, occurrence: _occurrence, occurrences: _occurrences};
     });
   }

diff --git a/styleguide.config.js b/styleguide.config.js
@@ -21,6 +21,10 @@ let sections = [
     name: 'Zero Width Joiners',
     content: 'src/docs/ZeroWidthJoiners.md',
   },
+  {
+    name: 'Benchmarks',
+    content: 'src/docs/Benchmarks.md',
+  },
 ];
 
 module.exports = {