Fixed Hindi tokenization issues with \u200D that should not break a word.
unfoldingWord · Aug 7, 2018 · b9dec35 · b9dec35
1 parent 6bb69c3
commit b9dec35
Show file tree

Hide file tree

Showing 8 changed files with 118 additions and 91 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,5 @@
+## v0.9.0
+
+- Fixed Hindi tokenization issues with \u200D that should not break a word.
+  - http://unicode.scarfboy.com/?s=%E0%A4%B8%E0%A4%A8%E0%A5%8D%E2%80%8D%E0%A4%A4%E0%A4%BE%E0%A4%A8
+- Extracted Occurrences functions to separate file for better organization.
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "string-punctuation-tokenizer",
-  "version": "0.8.0",
+  "version": "0.9.0",
   "description": "Small library that provides functions to tokenize a string into an array of words with or without punctuation",
   "main": "lib/index.js",
   "scripts": {

diff --git a/src/__tests__/occurrences.test.js b/src/__tests__/occurrences.test.js
@@ -0,0 +1,66 @@
+/* eslint-env jest */
+import * as occurrences from '../occurrences';
+
+const {
+  occurrenceInString,
+  occurrencesInString,
+} = occurrences;
+
+
+describe('occurrenceInString', function() {
+  it('should return occurrence for first of two', function() {
+    const string = 'a ab a';
+    const substring = 'a';
+    const index = 0;
+    const expected = 1;
+    const output = occurrenceInString(string, index, substring);
+    expect(output).toEqual(expected);
+  });
+  it('should return occurrence for second of two', function() {
+    const string = 'a ab a';
+    const substring = 'a';
+    const index = 2;
+    const expected = 2;
+    const output = occurrenceInString(string, index, substring);
+    expect(output).toEqual(expected);
+  });
+  it('should return occurrence for second of three', function() {
+    const string = 'a ab a bac a';
+    const substring = 'a';
+    const index = 2;
+    const expected = 2;
+    const output = occurrenceInString(string, index, substring);
+    expect(output).toEqual(expected);
+  });
+});
+
+describe('occurrencesInString', function() {
+  it('should return occurrences for none', function() {
+    const string = 'ab';
+    const substring = 'a';
+    const expected = 0;
+    const output = occurrencesInString(string, substring);
+    expect(output).toEqual(expected);
+  });
+  it('should return occurrences for one', function() {
+    const string = 'a ab';
+    const substring = 'a';
+    const expected = 1;
+    const output = occurrencesInString(string, substring);
+    expect(output).toEqual(expected);
+  });
+  it('should return occurrences for two', function() {
+    const string = 'a ab a';
+    const substring = 'a';
+    const expected = 2;
+    const output = occurrencesInString(string, substring);
+    expect(output).toEqual(expected);
+  });
+  it('should return occurrences for three', function() {
+    const string = 'a ab a bac a';
+    const substring = 'a';
+    const expected = 3;
+    const output = occurrencesInString(string, substring);
+    expect(output).toEqual(expected);
+  });
+});
diff --git a/__tests__/tokenizers.test.js → src/__tests__/tokenizers.test.js b/__tests__/tokenizers.test.js → src/__tests__/tokenizers.test.js
@@ -1,11 +1,9 @@
 /* eslint-env jest */
-import tokenizers from '../src/index';
+import * as tokenizers from '../tokenizers';
 
 const {
   tokenize,
   tokenizeWithPunctuation,
-  occurrenceInString,
-  occurrencesInString,
 } = tokenizers;
 
 describe('Tokenizer', function() {
@@ -117,60 +115,12 @@ describe('tokenizeWithPunctuation', function() {
   // });
 });
 
-describe('occurrenceInString', function() {
-  it('should return occurrence for first of two', function() {
-    const string = 'a ab a';
-    const substring = 'a';
-    const index = 0;
-    const expected = 1;
-    const output = occurrenceInString(string, index, substring);
-    expect(output).toEqual(expected);
-  });
-  it('should return occurrence for second of two', function() {
-    const string = 'a ab a';
-    const substring = 'a';
-    const index = 2;
-    const expected = 2;
-    const output = occurrenceInString(string, index, substring);
-    expect(output).toEqual(expected);
-  });
-  it('should return occurrence for second of three', function() {
-    const string = 'a ab a bac a';
-    const substring = 'a';
-    const index = 2;
-    const expected = 2;
-    const output = occurrenceInString(string, index, substring);
-    expect(output).toEqual(expected);
-  });
-});
 
-describe('occurrencesInString', function() {
-  it('should return occurrences for none', function() {
-    const string = 'ab';
-    const substring = 'a';
-    const expected = 0;
-    const output = occurrencesInString(string, substring);
-    expect(output).toEqual(expected);
-  });
-  it('should return occurrences for one', function() {
-    const string = 'a ab';
-    const substring = 'a';
-    const expected = 1;
-    const output = occurrencesInString(string, substring);
-    expect(output).toEqual(expected);
-  });
-  it('should return occurrences for two', function() {
-    const string = 'a ab a';
-    const substring = 'a';
-    const expected = 2;
-    const output = occurrencesInString(string, substring);
-    expect(output).toEqual(expected);
-  });
-  it('should return occurrences for three', function() {
-    const string = 'a ab a bac a';
-    const substring = 'a';
-    const expected = 3;
-    const output = occurrencesInString(string, substring);
-    expect(output).toEqual(expected);
+describe('Indic Languages Issues', function() {
+  it('\\u200D ZERO WIDTH JOINER', function() {
+    const string = 'अब्राहम की सन्‍तान, दाऊद की सन्‍तान, यीशु मसीह की वंशावली।';
+    const tokens = tokenize(string);
+    const expected = ['अब्राहम', 'की', 'सन्‍तान', 'दाऊद', 'की', 'सन्‍तान', 'यीशु', 'मसीह', 'की', 'वंशावली'];
+    expect(tokens).toEqual(expected);
   });
 });
diff --git a/src/index.js b/src/index.js
@@ -1,13 +1,16 @@
 import {
   tokenize,
   tokenizeWithPunctuation,
-  occurrenceInString,
-  occurrencesInString,
   word,
   punctuation,
   whitespace,
 } from './tokenizers';
 
+import {
+  occurrenceInString,
+  occurrencesInString,
+} from './occurrences'; // these helpers are defined in occurrences.js
+
 import {
   selectionArray,
   spliceStringOnRanges,

diff --git a/src/occurrences.js b/src/occurrences.js
@@ -0,0 +1,31 @@
+import * as tokenizers from './tokenizers';
+
+/**
+ * Gets the occurrence number of subString at a token index: counts the tokens equal to subString from index 0 through currentWordIndex.
+ * @param {String} string - The string to tokenize and search in
+ * @param {Number} currentWordIndex - Zero-based index (inclusive) into the tokenized string at which counting stops
+ * @param {String} subString - The token to count
+ * @return {Number} - How many tokens up to and including currentWordIndex are strictly equal to subString
+ */
+export const occurrenceInString = (string, currentWordIndex, subString) => {
+  let occurrence = 0;
+  const tokens = tokenizers.tokenize(string);
+  for (let i = 0; i <= currentWordIndex; i++) {
+    if (tokens[i] === subString) occurrence ++;
+  }
+  return occurrence;
+};
+/**
+ * Counts how many tokens of a string are strictly equal to subString (whole-token matches, not raw substring matches).
+ * @param {String} string - The string to tokenize and search in
+ * @param {String} subString - The token to search for
+ * @return {Number} - The count of matching tokens (0 when none match)
+ */
+export const occurrencesInString = (string, subString) => {
+  let occurrences = 0;
+  const tokens = tokenizers.tokenize(string);
+  tokens.forEach((token) => {
+    if (token === subString) occurrences ++;
+  });
+  return occurrences;
+};
diff --git a/src/tokenizers.js b/src/tokenizers.js
@@ -1,6 +1,7 @@
 import xRegExp from 'xregexp';
+import {occurrenceInString, occurrencesInString} from './occurrences';
 // constants
-export const word = xRegExp('[\\pL\\pM]+', '');
+export const word = xRegExp('[\\pL\\pM\\u200D]+', '');
 export const punctuation = xRegExp('(^\\p{P}|[<>]{2})', '');
 export const whitespace = /\s+/;
 const tokenizerOptions = {word, whitespace, punctuation};
@@ -42,35 +43,6 @@ export const tokenizeWithPunctuation = (string, options) => {
     });
   return tokens;
 };
-/**
- * Gets the occurrence of a subString in a string by using the subString index in the string.
- * @param {String} string
- * @param {Number} currentWordIndex
- * @param {String} subString
- * @return {Object}
- */
-export const occurrenceInString = (string, currentWordIndex, subString) => {
-  let occurrence = 0;
-  const tokens = tokenize(string);
-  for (let i = 0; i <= currentWordIndex; i++) {
-    if (tokens[i] === subString) occurrence ++;
-  }
-  return occurrence;
-};
-/**
- * Function that count occurrences of a substring in a string
- * @param {String} string - The string to search in
- * @param {String} subString - The sub string to search for
- * @return {Integer} - the count of the occurrences
- */
-export const occurrencesInString = (string, subString) => {
-  let occurrences = 0;
-  const tokens = tokenize(string);
-  tokens.forEach((token) => {
-    if (token === subString) occurrences ++;
-  });
-  return occurrences;
-};
 
 /**
  * Tiny tokenizer - https://gist.github.com/borgar/451393