Skip to content

Commit

Permalink
Major Refactor for 2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
klappy committed Sep 16, 2019
1 parent d5f8ddd commit 83bfa09
Show file tree
Hide file tree
Showing 11 changed files with 12,661 additions and 129 deletions.
21 changes: 10 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,21 @@ Small library that provides functions to tokenize a string into an array of word
`npm install string-punctuation-tokenizer`

## Usage
```js
var stringTokenizer = require('string-punctuation-tokenizer');
// or ES6
import stringTokenizer from 'string-punctuation-tokenizer'
```
`var stringTokenizer = require('string-punctuation-tokenizer');`

or ES6

`import {tokenize} from 'string-punctuation-tokenizer';`

#### Tokenize with punctuation
```js
var words = stringTokenizer.tokenizeWithPunctuation('Hello world, my name is Manny!');
// or ES6
let words = stringTokenizer.tokenizeWithPunctuation('Hello world, my name is Manny!');
import {tokenize} from 'string-punctuation-tokenizer';
let words = tokenize({text: 'Hello world, my name is Manny!', includePunctuation: true});
// words = ["Hello", "world", ",", "my", "name", "is", "Manny", "!"]
```
#### Tokenize without punctuation
```js
var words = stringTokenizer.tokenize('Hello world, my name is Manny!');
// or ES6
let words = stringTokenizer.tokenize('Hello world, my name is Manny!');
import {tokenize} from 'string-punctuation-tokenizer';
let words = tokenize({text: 'Hello world, my name is Manny!', includePunctuation: false});
// words = ["Hello", "world", "my", "name", "is", "Manny"]
```
16 changes: 13 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@
"description": "Small library that provides functions to tokenize a string into an array of words with or without punctuation",
"main": "lib/index.js",
"scripts": {
"start": "styleguidist server",
"styleguide:build": "styleguidist build",
"predeploy": "yarn styleguide:build",
"deploy": "gh-pages -d styleguide",
"test": "eslint ./src && jest",
"fix": "eslint ./src --fix",
"build": "babel ./src -d ./lib --ignore '**/__tests__,**/__mocks__'",
"compile": "rimraf lib && babel src/ -d lib/ --ignore '**/__tests__,**/__mocks__'",
"prebuild": "rm -rf ./lib",
"prepare": "if [ ! -d './lib/' ]; then npm run build; fi",
"prepublishOnly": "npm test && npm run compile",
"postpublish": "git tag v$npm_package_version && git push origin v$npm_package_version"
"prepublishOnly": "yarn test && yarn run compile",
"postpublish": "yarn deploy"
},
"jest": {
"collectCoverageFrom": [
Expand Down Expand Up @@ -52,6 +56,12 @@
"eslint": "^5.10.0",
"eslint-config-google": "^0.9.1",
"eslint-plugin-jest": "^22.1.2",
"jest": "^23.6.0"
"gh-pages": "^2.0.1",
"jest": "^23.6.0",
"lodash": "^4.17.11",
"react": "^16.8.6",
"react-dom": "^16.8.6",
"react-scripts": "2.1.8",
"react-styleguidist": "^9.0.5"
}
}
124 changes: 62 additions & 62 deletions src/__tests__/tokenizers.test.js

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions src/components/tokenize/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Tokenize example:

```js
string = `\
Hello world's!\
`;
<Tokenize string={string} />
```
25 changes: 25 additions & 0 deletions src/components/tokenize/Tokenize.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import React from 'react';

import {tokenize} from '../../tokenizers';

/**
* Adds two numbers together.
* @param {string} string the string input.
* @return {object} The react component.
*/
function Tokenize({
string,
}) {
const tokens = tokenize(string);
const tokenItems = tokens.map((token) =>
<li>
{token}
</li>
);

return (
<ul>{tokenItems}</ul>
);
}

export default Tokenize;
29 changes: 29 additions & 0 deletions src/docs/Greedy.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
## Greedy Examples:

Edit the options and watch the effect on the output.

```js
import {tokenize} from '../tokenizers.js';

const text = `“Didn’t David's 10,000 abençoando-os our h-e'a-rt's 'burn' disciples—and everyone else—what us?” -‭Luke‬ ‭2,4.3:2‬`;

const options = {
text,
includeWords: true,
includeNumbers: true,
includeWhitespace: false,
includePunctuation: true,
greedy: true,
verbose: false,
}
const tokens = tokenize(options);
const tokenItems = tokens.map(token => {
const string = (typeof token === 'string') ? token : JSON.stringify(token, null, 2);
return (<li>{string}</li>);
});

<>
<p>{text}</p>
<ol>{tokenItems}</ol>
</>
```
12 changes: 6 additions & 6 deletions src/occurrences.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,28 @@ import * as tokenizers from './tokenizers';

/**
* Gets the occurrence of a subString in a string by using the subString index in the string.
* @param {String} string
* @param {String} text
* @param {Number} currentWordIndex
* @param {String} subString
* @return {Object}
*/
export const occurrenceInString = (string, currentWordIndex, subString) => {
export const occurrenceInString = (text, currentWordIndex, subString) => {
let occurrence = 0;
const tokens = tokenizers.tokenize(string);
const tokens = tokenizers.tokenize({text});
for (let i = 0; i <= currentWordIndex; i++) {
if (tokens[i] === subString) occurrence ++;
}
return occurrence;
};
/**
* Function that count occurrences of a substring in a string
* @param {String} string - The string to search in
* @param {String} text - The string to search in
* @param {String} subString - The sub string to search for
* @return {Integer} - the count of the occurrences
*/
export const occurrencesInString = (string, subString) => {
export const occurrencesInString = (text, subString) => {
let occurrences = 0;
const tokens = tokenizers.tokenize(string);
const tokens = tokenizers.tokenize({text});
tokens.forEach((token) => {
if (token === subString) occurrences ++;
});
Expand Down
98 changes: 51 additions & 47 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,49 +2,53 @@ import xRegExp from 'xregexp';
import {occurrenceInString, occurrencesInString} from './occurrences';
// constants
export const word = xRegExp('[\\pL\\pM\\u200D]+', '');
export const greedyWord = xRegExp('([\\pL\\pM\\u200D]+([-\'’]?[\\pL\\pM\\u200D])+|[\\pL\\pM\\u200D]+)', '');
export const punctuation = xRegExp('(^\\p{P}|[<>]{2})', '');
export const whitespace = /\s+/;
export const number = /\d+/;
export const greedyNumber = /(\d+([:.,]?\d)+|\d+)/;
export const number_ = xRegExp(number);
const tokenizerOptions = {word, whitespace, punctuation, number};

/**
* Tokenize a string into an array of words
* @param {String} string - string to be tokenized
* @param {Object} params - string to be tokenized
* @return {Array} - array of tokenized words/strings
*/
export const tokenize = (string) => {
const tokenTypes = ['word', 'number'];
const _tokens = classifyTokens(string, tokenizerOptions);
const tokens = _tokens.filter((token) => tokenTypes.includes(token.type))
.map((token) => token.token);
return tokens;
};
/**
* Tokenize a string into an array of words
* @param {String} string - string to be tokenized
* @param {Object} options - include word occurrence or not.
* withWordOccurrence <boolean>
* @return {Array} - array of tokenized words/strings
*/
export const tokenizeWithPunctuation = (string, options) => {
const tokenTypes = ['word', 'number', 'punctuation'];
const _tokens = classifyTokens(string, tokenizerOptions);
const tokens = _tokens.filter((token) => tokenTypes.includes(token.type))
.map((token, index) => {
const occurrences = occurrencesInString(string, token.token);
const occurrence = occurrenceInString(string, index, token.token);
if (options && options.withWordOccurrence) {
return {
word: token.token,
type: token.type,
occurrence,
occurrences,
};
} else {
return token.token;
}
export const tokenize = ({
  text='',
  includeWords=true,
  includeNumbers=true,
  includeWhitespace=false,
  includePunctuation=false,
  greedy=false,
  verbose=false,
  occurrences=false,
  parsers={word, whitespace, punctuation, number},
}) => {
  // Swap in the greedy word/number patterns when requested.
  const activeParsers = greedy
    ? {...parsers, word: greedyWord, number: greedyNumber}
    : parsers;
  // Token types the caller asked to keep, driven by the include* flags.
  const wantedTypes = [
    includeWords && 'word',
    includeNumbers && 'number',
    includeWhitespace && 'whitespace',
    includePunctuation && 'punctuation',
  ].filter(Boolean);
  let tokens = classifyTokens(text, activeParsers)
      .filter((token) => wantedTypes.includes(token.type));
  if (occurrences) {
    // Annotate each token with its running occurrence index and the total
    // occurrence count within the text.
    // NOTE(review): this re-tokenizes the full text per token (quadratic);
    // acceptable for short strings — confirm before using on long input.
    tokens = tokens.map((token, index) => ({
      ...token,
      occurrence: occurrenceInString(text, index, token.token),
      occurrences: occurrencesInString(text, token.token),
    }));
  }
  if (verbose) {
    // Verbose output keeps the token objects, minus the raw regex matches.
    tokens.forEach((token) => {
      delete token.matches;
    });
    return tokens;
  }
  // Default output is the bare token strings; any occurrence annotations
  // added above are discarded in this branch.
  return tokens.map((token) => token.token);
};

Expand All @@ -65,9 +69,9 @@ export const classifyTokens = (string, parsers, deftok) => {
let t;
let tokens = [];
while (string) {
t = null;
m = string.length;
for ( let key in parsers ) {
t = null;
m = string.length;
for ( let key in parsers ) {
if (parsers.hasOwnProperty(key)) {
r = parsers[key].exec( string );
// try to choose the best match if there are several
Expand All @@ -81,20 +85,20 @@ export const classifyTokens = (string, parsers, deftok) => {
m = r.index;
}
}
}
if ( m ) {
}
if ( m ) {
// there is text between last token and currently
// matched token - push that out as default or "unknown"
tokens.push({
token: string.substr( 0, m ),
type: deftok || 'unknown',
});
}
if ( t ) {
tokens.push({
token: string.substr( 0, m ),
type: deftok || 'unknown',
});
}
if ( t ) {
// push current token onto sequence
tokens.push( t );
}
string = string.substr( m + (t ? t.token.length : 0) );
tokens.push( t );
}
string = string.substr( m + (t ? t.token.length : 0) );
}
return tokens;
};
38 changes: 38 additions & 0 deletions styleguide.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
const Path = require('path');
const upperFirst = require('lodash/upperFirst');
const camelCase = require('lodash/camelCase');
const { name, version, repository } = require('./package.json');
const { styles, theme } = require('./styleguide.styles');

let sections = [
{
name: 'README',
content: 'README.md',
},
{
name: 'Greedy',
content: 'src/docs/Greedy.md',
},
];

module.exports = {
title: `${upperFirst(camelCase(name))} v${version}`,
ribbon: {
url: repository.url,
text: 'View on GitHub'
},
webpackConfig: require('react-scripts/config/webpack.config')('development'),
// serverPort: 3000,
styles,
theme,
getComponentPathLine: (componentPath) => {
const dirname = Path.dirname(componentPath, '.js');
const file = dirname.split('/').slice(-1)[0];
const componentName = upperFirst(camelCase(file));
return `import { ${componentName} } from "${name}";`;
},
usageMode: 'expand',
exampleMode: 'expand',
pagePerSection: true,
sections,
};
Loading

0 comments on commit 83bfa09

Please sign in to comment.