Skip to content

Commit

Permalink
Major Refactor for 2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
klappy committed Sep 16, 2019
1 parent d5f8ddd commit 83bfa09
Show file tree
Hide file tree
Showing 11 changed files with 12,661 additions and 129 deletions.
21 changes: 10 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,21 @@ Small library that provides functions to tokenize a string into an array of word
`npm install string-punctuation-tokenizer`

## Usage
```js
var stringTokenizer = require('string-punctuation-tokenizer');
// or ES6
import stringTokenizer from 'string-punctuation-tokenizer'
```
`var stringTokenizer = require('string-punctuation-tokenizer');`

or ES6

`import {tokenize} from 'string-punctuation-tokenizer';`

#### Tokenize with punctuation
```js
var words = stringTokenizer.tokenizeWithPunctuation('Hello world, my name is Manny!');
// or ES6
let words = stringTokenizer.tokenizeWithPunctuation('Hello world, my name is Manny!');
import {tokenize} from 'string-punctuation-tokenizer';
let words = tokenize({text: 'Hello world, my name is Manny!', includePunctuation: true});
// words = ["Hello", "world", ",", "my", "name", "is", "Manny", "!"]
```
#### Tokenize without punctuation
```js
var words = stringTokenizer.tokenize('Hello world, my name is Manny!');
// or ES6
let words = stringTokenizer.tokenize('Hello world, my name is Manny!');
import {tokenize} from 'string-punctuation-tokenizer';
let words = tokenize({text: 'Hello world, my name is Manny!', includePunctuation: false});
// words = ["Hello", "world", "my", "name", "is", "Manny"]
```
16 changes: 13 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@
"description": "Small library that provides functions to tokenize a string into an array of words with or without punctuation",
"main": "lib/index.js",
"scripts": {
"start": "styleguidist server",
"styleguide:build": "styleguidist build",
"predeploy": "yarn styleguide:build",
"deploy": "gh-pages -d styleguide",
"test": "eslint ./src && jest",
"fix": "eslint ./src --fix",
"build": "babel ./src -d ./lib --ignore '**/__tests__,**/__mocks__'",
"compile": "rimraf lib && babel src/ -d lib/ --ignore '**/__tests__,**/__mocks__'",
"prebuild": "rm -rf ./lib",
"prepare": "if [ ! -d './lib/' ]; then npm run build; fi",
"prepublishOnly": "npm test && npm run compile",
"postpublish": "git tag v$npm_package_version && git push origin v$npm_package_version"
"prepublishOnly": "yarn test && yarn run compile",
"postpublish": "yarn deploy"
},
"jest": {
"collectCoverageFrom": [
Expand Down Expand Up @@ -52,6 +56,12 @@
"eslint": "^5.10.0",
"eslint-config-google": "^0.9.1",
"eslint-plugin-jest": "^22.1.2",
"jest": "^23.6.0"
"gh-pages": "^2.0.1",
"jest": "^23.6.0",
"lodash": "^4.17.11",
"react": "^16.8.6",
"react-dom": "^16.8.6",
"react-scripts": "2.1.8",
"react-styleguidist": "^9.0.5"
}
}
124 changes: 62 additions & 62 deletions src/__tests__/tokenizers.test.js

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions src/components/tokenize/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Tokenize example:

```js
string = `\
Hello world's!\
`;
<Tokenize string={string} />
```
25 changes: 25 additions & 0 deletions src/components/tokenize/Tokenize.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import React from 'react';

import {tokenize} from '../../tokenizers';

/**
* Adds two numbers together.
* @param {string} string the string input.
* @return {object} The react component.
*/
function Tokenize({
string,
}) {
const tokens = tokenize(string);
const tokenItems = tokens.map((token) =>
<li>
{token}
</li>
);

return (
<ul>{tokenItems}</ul>
);
}

export default Tokenize;
29 changes: 29 additions & 0 deletions src/docs/Greedy.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
## Greedy Examples:

Edit the options and watch the effect on the output.

```js
import {tokenize} from '../tokenizers.js';

const text = `“Didn’t David's 10,000 abençoando-os our h-e'a-rt's 'burn' disciples—and everyone else—what us?” -‭Luke‬ ‭2,4.3:2‬`;

const options = {
text,
includeWords: true,
includeNumbers: true,
includeWhitespace: false,
includePunctuation: true,
greedy: true,
verbose: false,
}
const tokens = tokenize(options);
const tokenItems = tokens.map(token => {
const string = (typeof token === 'string') ? token : JSON.stringify(token, null, 2);
return (<li>{string}</li>);
});

<>
<p>{text}</p>
<ol>{tokenItems}</ol>
</>
```
12 changes: 6 additions & 6 deletions src/occurrences.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,28 @@ import * as tokenizers from './tokenizers';

/**
* Gets the occurrence of a subString in a string by using the subString index in the string.
* @param {String} string
* @param {String} text
* @param {Number} currentWordIndex
* @param {String} subString
* @return {Object}
*/
export const occurrenceInString = (string, currentWordIndex, subString) => {
export const occurrenceInString = (text, currentWordIndex, subString) => {
let occurrence = 0;
const tokens = tokenizers.tokenize(string);
const tokens = tokenizers.tokenize({text});
for (let i = 0; i <= currentWordIndex; i++) {
if (tokens[i] === subString) occurrence ++;
}
return occurrence;
};
/**
* Function that count occurrences of a substring in a string
* @param {String} string - The string to search in
* @param {String} text - The string to search in
* @param {String} subString - The sub string to search for
* @return {Integer} - the count of the occurrences
*/
export const occurrencesInString = (string, subString) => {
export const occurrencesInString = (text, subString) => {
let occurrences = 0;
const tokens = tokenizers.tokenize(string);
const tokens = tokenizers.tokenize({text});
tokens.forEach((token) => {
if (token === subString) occurrences ++;
});
Expand Down
98 changes: 51 additions & 47 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,49 +2,53 @@ import xRegExp from 'xregexp';
import {occurrenceInString, occurrencesInString} from './occurrences';
// constants
export const word = xRegExp('[\\pL\\pM\\u200D]+', '');
export const greedyWord = xRegExp('([\\pL\\pM\\u200D]+([-\'’]?[\\pL\\pM\\u200D])+|[\\pL\\pM\\u200D]+)', '');
export const punctuation = xRegExp('(^\\p{P}|[<>]{2})', '');
export const whitespace = /\s+/;
export const number = /\d+/;
export const greedyNumber = /(\d+([:.,]?\d)+|\d+)/;
export const number_ = xRegExp(number);
const tokenizerOptions = {word, whitespace, punctuation, number};

/**
* Tokenize a string into an array of words
* @param {String} string - string to be tokenized
* @param {Object} params - string to be tokenized
* @return {Array} - array of tokenized words/strings
*/
export const tokenize = (string) => {
const tokenTypes = ['word', 'number'];
const _tokens = classifyTokens(string, tokenizerOptions);
const tokens = _tokens.filter((token) => tokenTypes.includes(token.type))
.map((token) => token.token);
return tokens;
};
/**
* Tokenize a string into an array of words
* @param {String} string - string to be tokenized
* @param {Object} options - include word occurrence or not.
* withWordOccurrence <boolean>
* @return {Array} - array of tokenized words/strings
*/
export const tokenizeWithPunctuation = (string, options) => {
const tokenTypes = ['word', 'number', 'punctuation'];
const _tokens = classifyTokens(string, tokenizerOptions);
const tokens = _tokens.filter((token) => tokenTypes.includes(token.type))
.map((token, index) => {
const occurrences = occurrencesInString(string, token.token);
const occurrence = occurrenceInString(string, index, token.token);
if (options && options.withWordOccurrence) {
return {
word: token.token,
type: token.type,
occurrence,
occurrences,
};
} else {
return token.token;
}
export const tokenize = ({
  text='',
  includeWords=true,
  includeNumbers=true,
  includeWhitespace=false,
  includePunctuation=false,
  greedy=false,
  verbose=false,
  occurrences=false,
  parsers={word, whitespace, punctuation, number},
}) => {
  // Swap in the greedy word/number patterns when requested.
  const activeParsers = greedy
    ? {...parsers, word: greedyWord, number: greedyNumber}
    : parsers;
  // Token types the caller asked to keep, driven by the include* flags.
  const wantedTypes = [
    includeWords && 'word',
    includeNumbers && 'number',
    includeWhitespace && 'whitespace',
    includePunctuation && 'punctuation',
  ].filter(Boolean);
  let tokens = classifyTokens(text, activeParsers)
      .filter((token) => wantedTypes.includes(token.type));
  if (occurrences) {
    // Annotate each token with its running occurrence index and the total
    // occurrence count within the text.
    // NOTE(review): this re-tokenizes the full text per token (quadratic);
    // acceptable for short strings — confirm before using on long input.
    tokens = tokens.map((token, index) => ({
      ...token,
      occurrence: occurrenceInString(text, index, token.token),
      occurrences: occurrencesInString(text, token.token),
    }));
  }
  if (verbose) {
    // Verbose output keeps the token objects, minus the raw regex matches.
    tokens.forEach((token) => {
      delete token.matches;
    });
    return tokens;
  }
  // Default output is the bare token strings; any occurrence annotations
  // added above are discarded in this branch.
  return tokens.map((token) => token.token);
};

Expand All @@ -65,9 +69,9 @@ export const classifyTokens = (string, parsers, deftok) => {
let t;
let tokens = [];
while (string) {
t = null;
m = string.length;
for ( let key in parsers ) {
t = null;
m = string.length;
for ( let key in parsers ) {
if (parsers.hasOwnProperty(key)) {
r = parsers[key].exec( string );
// try to choose the best match if there are several
Expand All @@ -81,20 +85,20 @@ export const classifyTokens = (string, parsers, deftok) => {
m = r.index;
}
}
}
if ( m ) {
}
if ( m ) {
// there is text between last token and currently
// matched token - push that out as default or "unknown"
tokens.push({
token: string.substr( 0, m ),
type: deftok || 'unknown',
});
}
if ( t ) {
tokens.push({
token: string.substr( 0, m ),
type: deftok || 'unknown',
});
}
if ( t ) {
// push current token onto sequence
tokens.push( t );
}
string = string.substr( m + (t ? t.token.length : 0) );
tokens.push( t );
}
string = string.substr( m + (t ? t.token.length : 0) );
}
return tokens;
};
38 changes: 38 additions & 0 deletions styleguide.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
const Path = require('path');
const upperFirst = require('lodash/upperFirst');
const camelCase = require('lodash/camelCase');
const { name, version, repository } = require('./package.json');
const { styles, theme } = require('./styleguide.styles');

let sections = [
{
name: 'README',
content: 'README.md',
},
{
name: 'Greedy',
content: 'src/docs/Greedy.md',
},
];

module.exports = {
title: `${upperFirst(camelCase(name))} v${version}`,
ribbon: {
url: repository.url,
text: 'View on GitHub'
},
webpackConfig: require('react-scripts/config/webpack.config')('development'),
// serverPort: 3000,
styles,
theme,
getComponentPathLine: (componentPath) => {
const dirname = Path.dirname(componentPath, '.js');
const file = dirname.split('/').slice(-1)[0];
const componentName = upperFirst(camelCase(file));
return `import { ${componentName} } from "${name}";`;
},
usageMode: 'expand',
exampleMode: 'expand',
pagePerSection: true,
sections,
};
Loading

0 comments on commit 83bfa09

Please sign in to comment.