Word vectors #132

Merged
merged 18 commits into from
Mar 18, 2024
18 commits
6db69fb
feat(wink-nlp): add word vectors parameter in winkNLP
sanjayaksaxena Sep 27, 2023
f9da534
test(*): update test model to web model
sanjayaksaxena Oct 1, 2023
c511921
feat(*): add as.vector and accordingly update the rest
sanjayaksaxena Oct 1, 2023
fd6f740
feat(*): add test-vectors.json in test model and update as & wink-nlp…
sanjayaksaxena Oct 4, 2023
cbc902a
refactor(as): extract vectors right at the beginning
sanjayaksaxena Oct 4, 2023
12bbfb9
feat(*): add vectorOf method under winkNLP using as.vector
sanjayaksaxena Oct 4, 2023
b50ef29
test(wink-nlp-specs): add test cases for vectorOf method
sanjayaksaxena Oct 4, 2023
661a09d
feat(*): add l2 norm during as.vector computation
sanjayaksaxena Oct 9, 2023
929d683
refactor: use gloVe vectors instead on enwiki
sanjayaksaxena Oct 10, 2023
475887b
feat(test-vectors): update enhanced format
sanjayaksaxena Feb 16, 2024
f36d87c
build(*): migrate from travis to github actions
sanjayaksaxena Feb 16, 2024
08bd979
docs(README): change build badge to point to github actions
sanjayaksaxena Feb 16, 2024
d5dfc21
test: use test-vectors that include unk vector definition
sanjayaksaxena Feb 18, 2024
29a35b1
feat(*): drop usage of as helper from vectorOf and test
sanjayaksaxena Feb 19, 2024
e9817f9
feat(*): add method to extract contextual word vectors from doc
sanjayaksaxena Mar 17, 2024
754ec35
feat(doc-v2): add error handling
sanjayaksaxena Mar 17, 2024
97c2766
test: add test cases for contextual vectors
sanjayaksaxena Mar 17, 2024
eed410c
test(contextual-vectors-specs): complete test cases
sanjayaksaxena Mar 18, 2024
30 changes: 30 additions & 0 deletions .github/workflows/coveralls.yml
@@ -0,0 +1,30 @@
on: ["push", "pull_request"]

name: Coveralls

jobs:

build:
name: Build
runs-on: ubuntu-latest
steps:

- uses: actions/checkout@v1

- name: Use Node.js 18.x
uses: actions/setup-node@v3
with:
node-version: 18.x

- name: npm install
run: |
npm install
npm run pretest
npm run test

- name: Coveralls
uses: coverallsapp/github-action@v2
with:
format: lcov
debug: false
allow-empty: false
32 changes: 32 additions & 0 deletions .github/workflows/node.js.yml
@@ -0,0 +1,32 @@
# This workflow will do a clean installation of node dependencies, cache/restore them, build the source code and run tests across different versions of node
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-nodejs

name: Node.js CI

on:
push:
branches: [ "word-vectors" ]
pull_request:
branches: [ "word-vectors" ]

jobs:
build:

runs-on: ubuntu-latest

strategy:
matrix:
node-version: [18.x]
# See supported Node.js release schedule at https://nodejs.org/en/about/releases/

steps:
- uses: actions/checkout@v3
- name: Use Node.js ${{ matrix.node-version }}
uses: actions/setup-node@v3
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'
- run: npm ci
- run: npm run build --if-present
- run: npm run pretest
- run: npm run test
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
# winkNLP

### [![Build Status](https://travis-ci.com/winkjs/wink-nlp.svg?branch=master)](https://travis-ci.com/github/winkjs/wink-nlp) [![Coverage Status](https://coveralls.io/repos/github/winkjs/wink-nlp/badge.svg?branch=master)](https://coveralls.io/github/winkjs/wink-nlp?branch=master) [![Known Vulnerabilities](https://snyk.io/test/github/winkjs/wink-nlp/badge.svg)](https://snyk.io/test/github/winkjs/wink-nlp) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6035/badge)](https://bestpractices.coreinfrastructure.org/projects/6035) [![Gitter](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/winkjs/Lobby) [![Follow on Twitter](https://img.shields.io/twitter/follow/winkjs_org?style=social)](https://twitter.com/winkjs_org)
### [![Build Status](https://github.com/winkjs/wink-nlp/actions/workflows/node.js.yml/badge.svg)](https://github.com/winkjs/wink-nlp/actions/workflows/node.js.yml/) [![Coverage Status](https://coveralls.io/repos/github/winkjs/wink-nlp/badge.svg?branch=master)](https://coveralls.io/github/winkjs/wink-nlp?branch=master) [![Known Vulnerabilities](https://snyk.io/test/github/winkjs/wink-nlp/badge.svg)](https://snyk.io/test/github/winkjs/wink-nlp) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/6035/badge)](https://bestpractices.coreinfrastructure.org/projects/6035) [![Gitter](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/winkjs/Lobby) [![Follow on Twitter](https://img.shields.io/twitter/follow/winkjs_org?style=social)](https://twitter.com/winkjs_org)

## Developer friendly Natural Language Processing ✨
[<img align="right" src="https://decisively.github.io/wink-logos/logo-title.png" width="100px" >](https://winkjs.org/)
2 changes: 1 addition & 1 deletion package.json
@@ -27,7 +27,7 @@
"main": "src/wink-nlp.js",
"scripts": {
"pretest": "npm run lint",
"test": "nyc --reporter=html --reporter=text mocha ./test/",
"test": "nyc --reporter=html --reporter=lcov --reporter=text mocha ./test/",
"coverage": "nyc report --reporter=text-lcov | coveralls",
"sourcedocs": "docker -i src -o ./sourcedocs --sidebar yes",
"lint": "eslint ./src/*.js ./test/*.js",
6 changes: 4 additions & 2 deletions src/allowed.js
@@ -65,7 +65,8 @@ allowed.as4tokens = new Set( [
as.freqTable,
as.bigrams,
as.unique,
as.markedUpText
as.markedUpText,
as.vector
] );

// NOTE: it should exclude `as.markedUpText`, whenever this is included in the above.
@@ -76,7 +77,8 @@ allowed.as4selTokens = new Set( [
as.bow,
as.freqTable,
as.bigrams,
as.unique
as.unique,
as.vector
] );

allowed.its4entity = new Set( [
12 changes: 6 additions & 6 deletions src/api/col-tokens-out.js
@@ -53,15 +53,15 @@ var psMask = constants.psMask;
* @private
*/
var colTokensOut = function ( start, end, rdd, itsf, asf, addons ) {
// Vectors require completely different handling.
if ( itsf === its.vector ) {
return its.vector( start, end, rdd.tokens, addons );
}

// Not a vector request, perform map-reduce.
var mappedTkns = [];
var itsfn = ( itsf && allowed.its4tokens.has( itsf ) ) ? itsf : its.value;
var asfn = ( asf && allowed.as4tokens.has( asf ) ) ? asf : as.array;

if ( itsfn !== its.value && itsfn !== its.normal && itsfn !== its.lemma && asfn === as.vector ) {
throw Error( 'winkNLP: as.vector is allowed only with its value or normal or lemma.' );
}

// Note, `as.text/markedUpText` needs special attention to include preceding spaces.
if ( asfn === as.text || asfn === as.markedUpText ) {
for ( let i = start; i <= end; i += 1 ) {
@@ -73,7 +73,7 @@ var colTokensOut = function ( start, end, rdd, itsf, asf, addons ) {
}
}

return asfn( mappedTkns, rdd.markings, start, end );
return asfn( mappedTkns, rdd, start, end );
}; // colTokensOut()

module.exports = colTokensOut;
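The guard added in `colTokensOut` above rejects `as.vector` whenever the `its` helper is anything other than `its.value`, `its.normal`, or `its.lemma` (only those yield the strings needed for vector lookup). A minimal standalone sketch of that check, using stand-in `its`/`as` objects for illustration (not wink-nlp's real helpers):

```javascript
// Stand-ins for wink-nlp's its/as helpers — illustrative only.
const its = { value: () => {}, normal: () => {}, lemma: () => {}, pos: () => {} };
const as = { array: () => {}, vector: () => {} };

// Mirrors the combination check added in colTokensOut/selTokensOut.
function validate( itsfn, asfn ) {
  if ( itsfn !== its.value && itsfn !== its.normal && itsfn !== its.lemma && asfn === as.vector ) {
    throw Error( 'winkNLP: as.vector is allowed only with its value or normal or lemma.' );
  }
}

validate( its.normal, as.vector );  // allowed: normal produces strings
validate( its.pos, as.array );      // allowed: as.array accepts anything
// validate( its.pos, as.vector );  // would throw
```

The check runs after the `itsf`/`asf` arguments have been defaulted, so an unknown `its` helper silently falls back to `its.value` rather than triggering the error.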
4 changes: 0 additions & 4 deletions src/api/itm-document-out.js
@@ -47,10 +47,6 @@ var colTokensOut = require( './col-tokens-out.js' );
*/
var itmDocumentOut = function ( rdd, itsf, addons ) {
var document = rdd.document;
// Vectors require completely different handling.
if ( itsf === its.vector ) {
return its.vector( document, rdd, addons );
}

var itsfn = ( itsf && allowed.its4document.has( itsf ) ) ? itsf : its.value;

4 changes: 0 additions & 4 deletions src/api/itm-sentence-out.js
@@ -48,10 +48,6 @@ var colTokensOut = require( './col-tokens-out.js' );
*/
var itmSentenceOut = function ( index, rdd, itsf, addons ) {
var sentence = rdd.sentences[ index ];
// Vectors require completely different handling.
if ( itsf === its.vector ) {
return its.vector( sentence, rdd, addons );
}

var itsfn = ( itsf && allowed.its4sentence.has( itsf ) ) ? itsf : its.value;

4 changes: 0 additions & 4 deletions src/api/itm-token-out.js
@@ -45,10 +45,6 @@ var allowed = require( '../allowed.js' );
* @private
*/
var itmTokenOut = function ( index, rdd, itsf, addons ) {
// Vectors require completely different handling.
if ( itsf === its.vector ) {
return its.vector( index, rdd, addons );
}
// Not a vector request, map using `itsf`.
var f = ( allowed.its4token.has( itsf ) ) ? itsf : its.value;
return f( index, rdd.tokens, rdd.cache, addons );
11 changes: 5 additions & 6 deletions src/api/sel-tokens-out.js
@@ -52,16 +52,15 @@ var psMask = constants.psMask;
* @private
*/
var selTokensOut = function ( selTokens, rdd, itsf, asf, addons ) {
// Vectors require completely different handling.
if ( itsf === its.vector ) {
return its.vector( selTokens, rdd.tokens, addons );
}

// Not a vector request, perform map-reduce.
var mappedTkns = [];
var itsfn = ( itsf && allowed.its4selTokens.has( itsf ) ) ? itsf : its.value;
var asfn = ( asf && allowed.as4selTokens.has( asf ) ) ? asf : as.array;

if ( itsfn !== its.value && itsfn !== its.normal && itsfn !== its.lemma && asfn === as.vector ) {
throw Error( 'winkNLP: as.vector is allowed only with its value or normal or lemma.' );
}

// Note, `as.text` needs special attention to include preceding spaces.
// No `markedUpText` allowed here.
if ( asfn === as.text ) {
@@ -74,7 +73,7 @@ var selTokensOut = function ( selTokens, rdd, itsf, asf, addons ) {
}
}

return asfn( mappedTkns );
return asfn( mappedTkns, rdd );
}; // selTokensOut()

module.exports = selTokensOut;
48 changes: 46 additions & 2 deletions src/as.js
@@ -152,13 +152,15 @@ as.text = function ( twps ) {
* `twps` and `markings`.
*
* @param {array} twps Array containing tokens with preceding spaces.
* @param {array} markings Array containing span of markings & marking specs.
* @param {object} rdd Raw Document Data structure.
* @param {number} start The start index of the tokens.
* @param {number} end The end index of the tokens.
* @return {string} the markedup text.
* @private
*/
as.markedUpText = function ( twps, markings, start, end ) {
as.markedUpText = function ( twps, rdd, start, end ) {
// Extract markings.
const markings = rdd.markings;
// Offset to be added while computing `first` and `last` indexes of `twps`.
var offset = start * 2;
// Compute the `range` of `markings` to consider on the basis `start` and `end`.
@@ -183,4 +185,46 @@
return twps.join( '' ).trim();
}; // markedUpText()

as.vector = function ( tokens, rdd ) {
// Get size of a vector from word vectors
const size = rdd.wordVectors.dimensions;
const precision = rdd.wordVectors.precision;
const vectors = rdd.wordVectors.vectors;
// Set up a new initialized vector of `size`
const v = new Array( size );
v.fill( 0 );
// Compute average.
// We will count the number of tokens as some of them may not have a vector.
let numOfTokens = 0;
for ( let i = 0; i < tokens.length; i += 1 ) {
// Extract token vector for the current token.
const tv = vectors[ tokens[ i ].toLowerCase() ];
// Increment `numOfTokens` if the above operation was successful.
if ( tv !== undefined ) numOfTokens += 1;
for ( let j = 0; j < size; j += 1 ) {
// Keep summing; eventually it will be divided by `numOfTokens` to obtain the average.
v[ j ] += ( tv === undefined ) ? 0 : tv[ j ];
}
}

// If no token's vector is found, return a 0-vector!
if ( numOfTokens === 0 ) {
// Push l2Norm, which is 0 in this case.
v.push( 0 );
return v;
}

// Non-0 vector, find average by dividing the sum by numOfTokens
// also compute l2Norm.
let l2Norm = 0;
for ( let i = 0; i < size; i += 1 ) {
v[ i ] = +( v[ i ] / numOfTokens ).toFixed( precision );
l2Norm += v[ i ] * v[ i ];
}
// `l2Norm` is appended as the last element for faster cosine similarity/normalization.
v.push( +( Math.sqrt( l2Norm ).toFixed( precision ) ) );

return v;
}; // vector()

module.exports = as;
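The averaging scheme that `as.vector` implements above can be sketched standalone: token vectors are summed, averaged over the tokens actually found in the table, rounded to `precision`, and the l2 norm is appended as the final element so that cosine similarity can reuse it without recomputation. The tiny 2-dimensional `wordVectors` table below is made up for illustration; the real model ships pre-trained GloVe vectors.

```javascript
// Made-up 2-dimensional word-vector table, for illustration only.
const wordVectors = {
  dimensions: 2,
  precision: 2,
  vectors: {
    cat: [ 1, 0 ],
    dog: [ 0, 1 ]
  }
};

// Average the vectors of `tokens`; append the l2 norm as the last element.
function avgVector( tokens, { dimensions, precision, vectors } ) {
  const v = new Array( dimensions ).fill( 0 );
  let numOfTokens = 0;
  for ( const t of tokens ) {
    const tv = vectors[ t.toLowerCase() ];
    if ( tv === undefined ) continue;
    numOfTokens += 1;
    for ( let j = 0; j < dimensions; j += 1 ) v[ j ] += tv[ j ];
  }
  // No token found: return a 0-vector with a 0 norm appended.
  if ( numOfTokens === 0 ) { v.push( 0 ); return v; }
  let l2Norm = 0;
  for ( let i = 0; i < dimensions; i += 1 ) {
    v[ i ] = +( v[ i ] / numOfTokens ).toFixed( precision );
    l2Norm += v[ i ] * v[ i ];
  }
  v.push( +Math.sqrt( l2Norm ).toFixed( precision ) );
  return v;
}

// Cosine similarity that reuses the appended norms (last element of each vector).
function cosine( a, b ) {
  const size = a.length - 1;
  let dot = 0;
  for ( let i = 0; i < size; i += 1 ) dot += a[ i ] * b[ i ];
  const denom = a[ size ] * b[ size ];
  return denom === 0 ? 0 : dot / denom;
}

console.log( avgVector( [ 'cat', 'dog' ], wordVectors ) ); // [ 0.5, 0.5, 0.71 ]
```

Storing the norm alongside the vector is the design choice the final comment in the diff alludes to: callers comparing many vectors pay the `Math.sqrt` cost once per vector rather than once per comparison.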