Skip to content

Commit

Permalink
Starting the keyword-extraction namespace
Browse files Browse the repository at this point in the history
  • Loading branch information
Yomguithereal committed Jan 30, 2017
1 parent 6589b30 commit edadb24
Show file tree
Hide file tree
Showing 6 changed files with 548 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ TODO.md
/helpers
/inflectors
/keyers
/keyword-extraction
/metrics
/parsers
/phonetics
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"url": "git+https://github.com/yomguithereal/talisman.git"
},
"talisman": {
"folders": "classification clustering features hash helpers inflectors keyers metrics parsers phonetics regexp stats stemmers tag tokenizers"
"folders": "classification clustering features hash helpers inflectors keyers keyword-extraction metrics parsers phonetics regexp stats stemmers tag tokenizers"
},
"keywords": [
"bayes",
Expand Down
76 changes: 76 additions & 0 deletions src/keyword-extraction/rake.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/**
* Talisman keyword-extraction/rake
* =================================
*
* JavaScript implementation of the "Rapid Automatic Keyword Extraction" (RAKE).
*
* [Article]:
* Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010). Automatic Keyword
* Extraction from Individual Documents. In M. W. Berry & J. Kogan (Eds.),
* Text Mining: Theory and Applications: John Wiley & Sons.
*/

// TODO: doesn't need to tokenize sentence I guess...
// TODO: need to hash the phrases

/**
 * Constants.
 */

// Matches a token made only of characters that are neither word characters
// nor whitespace; such tokens act as phrase delimiters.
// NOTE(review): without the `u` flag, `\w` only covers [A-Za-z0-9_], so a
// token made solely of non-ASCII letters is also treated as a delimiter.
const PUNCTUATION = /^[^\w\s]+$/;

/**
 * Factory function taking some options & returning a custom RAKE function.
 *
 * @param  {object}   options           - Options:
 * @param  {array}    options.stopwords - List of stopwords to use.
 * @return {function}                   - The extractor function.
 *
 * @throws {Error} - Will throw if `options.stopwords` is not an array.
 */
export default function createExtractor(options) {
  options = options || {};

  const stopwords = options.stopwords;

  if (!Array.isArray(stopwords))
    throw new Error('talisman/keyword-extraction/rake: expecting a list of stopwords.');

  // Using a set so that stopword membership is tested in constant time
  const stopwordsSet = new Set(stopwords);

  /**
   * RAKE function taking an array of sentences being tokenized as words.
   * Note that the tokenization must keep punctuation in order to be able
   * to extract phrases.
   *
   * Alternatively, one can also stem the given tokens beforehand to minimize
   * the number of distinct keyword words.
   *
   * For the time being, the function only performs the first step of the
   * algorithm and returns the candidate phrases, i.e. the maximal runs of
   * consecutive tokens being neither stopwords nor punctuation. A phrase
   * never spans two sentences.
   *
   * @param  {array} doc - Target document (array of arrays of tokens).
   * @return {array}     - Candidate phrases, each one an array of tokens.
   */
  return function(doc) {

    //-- 1) We need to find candidate phrases by splitting tokens by stopwords
    const candidatePhrases = [];

    for (let i = 0, l = doc.length; i < l; i++) {
      const sentence = doc[i];

      let phrase = [];

      for (let j = 0, m = sentence.length; j < m; j++) {
        const word = sentence[j];

        if (stopwordsSet.has(word) || PUNCTUATION.test(word)) {

          // A delimiter closes the current phrase, if any
          if (phrase.length) {
            candidatePhrases.push(phrase);
            phrase = [];
          }
        }
        else {
          phrase.push(word);
        }
      }

      // The end of the sentence is a delimiter too: we need to flush the
      // pending phrase, else it would be lost when the sentence does not
      // end with a stopword or a punctuation token.
      if (phrase.length)
        candidatePhrases.push(phrase);
    }

    return candidatePhrases;
  };
}
Loading

0 comments on commit edadb24

Please sign in to comment.