Skip to content

Commit

Permalink
Benchmarks and optimized occurrences.
Browse files Browse the repository at this point in the history
  • Loading branch information
klappy committed Sep 16, 2019
1 parent 9cc1591 commit 4df0ee5
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 50 deletions.
51 changes: 51 additions & 0 deletions src/docs/Benchmarks.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@

```js
import {tokenize, word, number, punctuation, whitespace} from '../tokenizers.js';

const text = `It's said that th\u200Dere are 1,000.00 different ways,\nto say...\t"I—Love—You."`;

// Number of tokenize + serialize rounds timed for each option set.
const iterations = 10000;

/**
 * Times `iterations` rounds of tokenizing with the given options and
 * serializing the resulting tokens.
 * @param {Object} options - The options object handed to `tokenize` on every round
 * @return {Number} - elapsed time, as the difference of two `performance.now()` readings
 */
const timeTokenize = (options) => {
  const start = performance.now();
  for (let i = 0; i < iterations; i++) {
    const tokens = tokenize(options);
    // Serialization is part of the measured work; its result is not needed.
    JSON.stringify(tokens, null, 2);
  }
  return performance.now() - start;
};

// One measurement per option set; these names are read by the rendered markup.
const defaultOptions = timeTokenize({text});
const greedy = timeTokenize({
  text,
  greedy: true,
});
const occurrences = timeTokenize({
  text,
  verbose: true,
  occurrences: true,
});

// wrapped in a React fragment for rendering:
<>
<h4>Each run {iterations} times.</h4>
<p><strong>default:</strong> {defaultOptions.toFixed(0)}ms</p>
<p><strong>greedy:</strong> {greedy.toFixed(0)}ms</p>
<p><strong>occurrences:</strong> {occurrences.toFixed(0)}ms</p>
</>
```
81 changes: 42 additions & 39 deletions src/occurrences.js
Original file line number Diff line number Diff line change
@@ -1,66 +1,69 @@
import * as tokenizers from './tokenizers';
import {tokenize} from './tokenizers';

/**
 * Counts how many tokens, from the start of the list up to and including
 * `currentWordIndex`, have text equal to `subString`.
 * @param {Array} tokens - Token objects; each one is compared through its `token` property
 * @param {Number} currentWordIndex - Index of the last token to inspect (inclusive)
 * @param {String} subString - The token text to count
 * @return {Number} - the number of matching tokens in the inspected range
 */
export const occurrenceInTokens = (
  tokens,
  currentWordIndex,
  subString,
) => {
  // Stop at the last real token: reading past the end of the array would
  // dereference `undefined` and throw.
  const lastIndex = Math.min(currentWordIndex, tokens.length - 1);
  let occurrence = 0;
  for (let i = 0; i <= lastIndex; i++) {
    if (tokens[i].token === subString) occurrence ++;
  }
  return occurrence;
};

/**
 * Function that counts the tokens whose text equals a sub string
 * @param {Array} tokens - Token objects to search in; each one is compared through its `token` property
 * @param {String} subString - The sub string to search for
 * @return {Integer} - the count of the occurrences
 */
export const occurrencesInTokens = (
  tokens,
  subString,
) => {
  let occurrences = 0;
  tokens.forEach((token) => {
    if (token.token === subString) occurrences ++;
  });
  return occurrences;
};

/**
 * Tokenizes `text` (verbose mode) and passes the tokens to `occurrenceInTokens`
 * together with `currentWordIndex` and `subString`.
 * @param {String} text - The string to tokenize and search in
 * @param {Number} currentWordIndex - Token index forwarded to `occurrenceInTokens`
 * @param {String} subString - The token text to look for
 * @return {Number} - the value returned by `occurrenceInTokens`
 */
export const occurrenceInString = (
  text,
  currentWordIndex,
  subString,
) => occurrenceInTokens(
  tokenize({text, verbose: true}),
  currentWordIndex,
  subString,
);

/**
 * Tokenizes `text` (verbose mode) and passes the tokens to `occurrencesInTokens`
 * together with `subString`.
 * @param {String} text - The string to tokenize and search in
 * @param {String} subString - The sub string to search for
 * @return {Integer} - the value returned by `occurrencesInTokens`
 */
export const occurrencesInString = (
  text,
  subString,
) => occurrencesInTokens(
  tokenize({text, verbose: true}),
  subString,
);
14 changes: 3 additions & 11 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import xRegExp from 'xregexp';
import {occurrenceInString, occurrencesInString} from './occurrences';
import {occurrenceInTokens, occurrencesInTokens} from './occurrences';
// constants
// `word`: a run of one or more characters that are Unicode letters (\pL) or
// marks (\pM), or the code points U+200D / U+2060, which the class lists
// explicitly so they do not split a word.
export const word = xRegExp('[\\pL\\pM\\u200D\\u2060]+', '');
// `greedyWord`: same character class as `word`, but the first alternative also
// lets a single `-`, `'` or `’` sit between two word characters, so the
// connector is kept inside the match. The second alternative falls back to a
// plain run of word characters.
export const greedyWord = xRegExp('([\\pL\\pM\\u200D\\u2060]+([-\'’]?[\\pL\\pM\\u200D\\u2060])+|[\\pL\\pM\\u200D\\u2060]+)', '');
Expand Down Expand Up @@ -36,16 +36,8 @@ export const tokenize = ({
tokens = tokens.filter((token) => types.includes(token.type));
if (occurrences) {
tokens = tokens.map((token, index) => {
const options = {
includeWords,
includeNumbers,
includePunctuation,
includeWhitespace,
greedy,
parsers,
};
const _occurrences = occurrencesInString(text, token.token, options);
const _occurrence = occurrenceInString(text, index, token.token, options);
const _occurrences = occurrencesInTokens(tokens, token.token);
const _occurrence = occurrenceInTokens(tokens, index, token.token);
return {...token, occurrence: _occurrence, occurrences: _occurrences};
});
}
Expand Down
4 changes: 4 additions & 0 deletions styleguide.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ let sections = [
name: 'Zero Width Joiners',
content: 'src/docs/ZeroWidthJoiners.md',
},
{
name: 'Benchmarks',
content: 'src/docs/Benchmarks.md',
},
];

module.exports = {
Expand Down

0 comments on commit 4df0ee5

Please sign in to comment.