Skip to content

Commit

Permalink
Fixed Hindi tokenization issues with \u200D that should not break a w…
Browse files Browse the repository at this point in the history
…ord.
  • Loading branch information
klappy committed Aug 7, 2018
1 parent 6bb69c3 commit b9dec35
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 91 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
## v0.9.0

- Fixed Hindi tokenization: the zero-width joiner (\u200D) is now treated as part of a word instead of breaking it.
- http://unicode.scarfboy.com/?s=%E0%A4%B8%E0%A4%A8%E0%A5%8D%E2%80%8D%E0%A4%A4%E0%A4%BE%E0%A4%A8
- Extracted Occurrences functions to separate file for better organization.
2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "string-punctuation-tokenizer",
"version": "0.8.0",
"version": "0.9.0",
"description": "Small library that provides functions to tokenize a string into an array of words with or without punctuation",
"main": "lib/index.js",
"scripts": {
Expand Down
66 changes: 66 additions & 0 deletions src/__tests__/occurrences.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/* eslint-env jest */
import * as occurrences from '../occurrences';

const {
occurrenceInString,
occurrencesInString,
} = occurrences;


// Table-driven cases: each one passes a token index to occurrenceInString and
// expects the 1-based occurrence number of `substring` at that position.
describe('occurrenceInString', () => {
  const substring = 'a';
  const cases = [
    {
      title: 'should return occurrence for first of two',
      string: 'a ab a',
      index: 0,
      expected: 1,
    },
    {
      title: 'should return occurrence for second of two',
      string: 'a ab a',
      index: 2,
      expected: 2,
    },
    {
      title: 'should return occurrence for second of three',
      string: 'a ab a bac a',
      index: 2,
      expected: 2,
    },
  ];

  cases.forEach(({title, string, index, expected}) => {
    it(title, () => {
      const output = occurrenceInString(string, index, substring);
      expect(output).toEqual(expected);
    });
  });
});

// Table-driven cases: each one counts how many whole tokens of `string` are
// equal to `substring` ('ab' and 'bac' contain an "a" but are not matches).
describe('occurrencesInString', () => {
  const substring = 'a';
  const cases = [
    {title: 'should return occurrences for none', string: 'ab', expected: 0},
    {title: 'should return occurrences for one', string: 'a ab', expected: 1},
    {title: 'should return occurrences for two', string: 'a ab a', expected: 2},
    {title: 'should return occurrences for three', string: 'a ab a bac a', expected: 3},
  ];

  cases.forEach(({title, string, expected}) => {
    it(title, () => {
      const output = occurrencesInString(string, substring);
      expect(output).toEqual(expected);
    });
  });
});
64 changes: 7 additions & 57 deletions __tests__/tokenizers.test.js → src/__tests__/tokenizers.test.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
/* eslint-env jest */
import tokenizers from '../src/index';
import * as tokenizers from '../tokenizers';

const {
tokenize,
tokenizeWithPunctuation,
occurrenceInString,
occurrencesInString,
} = tokenizers;

describe('Tokenizer', function() {
Expand Down Expand Up @@ -117,60 +115,12 @@ describe('tokenizeWithPunctuation', function() {
// });
});

describe('occurrenceInString', function() {
it('should return occurrence for first of two', function() {
const string = 'a ab a';
const substring = 'a';
const index = 0;
const expected = 1;
const output = occurrenceInString(string, index, substring);
expect(output).toEqual(expected);
});
it('should return occurrence for second of two', function() {
const string = 'a ab a';
const substring = 'a';
const index = 2;
const expected = 2;
const output = occurrenceInString(string, index, substring);
expect(output).toEqual(expected);
});
it('should return occurrence for second of three', function() {
const string = 'a ab a bac a';
const substring = 'a';
const index = 2;
const expected = 2;
const output = occurrenceInString(string, index, substring);
expect(output).toEqual(expected);
});
});

describe('occurrencesInString', function() {
it('should return occurrences for none', function() {
const string = 'ab';
const substring = 'a';
const expected = 0;
const output = occurrencesInString(string, substring);
expect(output).toEqual(expected);
});
it('should return occurrences for one', function() {
const string = 'a ab';
const substring = 'a';
const expected = 1;
const output = occurrencesInString(string, substring);
expect(output).toEqual(expected);
});
it('should return occurrences for two', function() {
const string = 'a ab a';
const substring = 'a';
const expected = 2;
const output = occurrencesInString(string, substring);
expect(output).toEqual(expected);
});
it('should return occurrences for three', function() {
const string = 'a ab a bac a';
const substring = 'a';
const expected = 3;
const output = occurrencesInString(string, substring);
expect(output).toEqual(expected);
describe('Indic Languages Issues', function() {
it('\\u200D ZERO WIDTH JOINER', function() {
const string = 'अब्राहम की सन्‍तान, दाऊद की सन्‍तान, यीशु मसीह की वंशावली।';
const tokens = tokenize(string);
const expected = ['अब्राहम', 'की', 'सन्‍तान', 'दाऊद', 'की', 'सन्‍तान', 'यीशु', 'मसीह', 'की', 'वंशावली'];
expect(tokens).toEqual(expected);
});
});
7 changes: 5 additions & 2 deletions src/index.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import {
tokenize,
tokenizeWithPunctuation,
occurrenceInString,
occurrencesInString,
word,
punctuation,
whitespace,
} from './tokenizers';

import {
occurrenceInString,
occurrencesInString,
} from './tokenizers';

import {
selectionArray,
spliceStringOnRanges,
Expand Down
31 changes: 31 additions & 0 deletions src/occurrences.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import * as tokenizers from './tokenizers';

/**
 * Gets the occurrence number of a token in a string, using the token's index
 * in the tokenized string.
 * The string is split with tokenizers.tokenize, and every token from index 0
 * through currentWordIndex (inclusive) that is strictly equal to subString is
 * counted.
 * @param {String} string - The string to tokenize and search in
 * @param {Number} currentWordIndex - Index of the token of interest in the token list
 * @param {String} subString - The token text to count
 * @return {Number} - Count of tokens equal to subString at or before currentWordIndex
 */
export const occurrenceInString = (string, currentWordIndex, subString) => {
  const tokens = tokenizers.tokenize(string);
  // Never read past the last token: out-of-range slots are `undefined`, which
  // would be miscounted as matches for an undefined subString, and an
  // oversized index would otherwise keep the loop running for nothing.
  const lastIndex = Math.min(currentWordIndex, tokens.length - 1);
  let occurrence = 0;
  for (let i = 0; i <= lastIndex; i++) {
    if (tokens[i] === subString) occurrence++;
  }
  return occurrence;
};
/**
 * Counts how many tokens of a string are strictly equal to a given substring.
 * The string is split with tokenizers.tokenize before comparing.
 * @param {String} string - The string to tokenize and search in
 * @param {String} subString - The token text to look for
 * @return {Number} - the number of matching tokens
 */
export const occurrencesInString = (string, subString) => {
  const matches = tokenizers
    .tokenize(string)
    .filter((candidate) => candidate === subString);
  return matches.length;
};
32 changes: 2 additions & 30 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import xRegExp from 'xregexp';
import {occurrenceInString, occurrencesInString} from './occurrences';
// constants
export const word = xRegExp('[\\pL\\pM]+', '');
export const word = xRegExp('[\\pL\\pM\\u200D]+', '');
export const punctuation = xRegExp('(^\\p{P}|[<>]{2})', '');
export const whitespace = /\s+/;
const tokenizerOptions = {word, whitespace, punctuation};
Expand Down Expand Up @@ -42,35 +43,6 @@ export const tokenizeWithPunctuation = (string, options) => {
});
return tokens;
};
/**
* Gets the occurrence of a subString in a string by using the subString index in the string.
* @param {String} string
* @param {Number} currentWordIndex
* @param {String} subString
* @return {Object}
*/
export const occurrenceInString = (string, currentWordIndex, subString) => {
let occurrence = 0;
const tokens = tokenize(string);
for (let i = 0; i <= currentWordIndex; i++) {
if (tokens[i] === subString) occurrence ++;
}
return occurrence;
};
/**
* Function that count occurrences of a substring in a string
* @param {String} string - The string to search in
* @param {String} subString - The sub string to search for
* @return {Integer} - the count of the occurrences
*/
export const occurrencesInString = (string, subString) => {
let occurrences = 0;
const tokens = tokenize(string);
tokens.forEach((token) => {
if (token === subString) occurrences ++;
});
return occurrences;
};

/**
* Tiny tokenizer - https://gist.github.com/borgar/451393
Expand Down

0 comments on commit b9dec35

Please sign in to comment.