-
Notifications
You must be signed in to change notification settings - Fork 5
/
index.js
365 lines (263 loc) · 9.62 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
/*
==========================================
TextRank: Bringing Order into Texts
Performs sentence extraction only.
Used for automatic article summarization.
==========================================
*/
// Article is a string of text to summarize
exports.TextRank = function (article, settings) {
this.printError = function (msg) {
console.log("TextRank ERROR:", msg);
}
if(typeof article != "string") {
this.printError("Article Must Be Type String");
return;
}
if(article.length < 1){
this.printError("Article Can't Be Empty");
return;
}
if(!settings){
settings = {};
}
this.extractAmount = (settings["extractAmount"])? settings["extractAmount"] : 5;
// Random surfer model, used in the similarity scoring function
this.d = (settings["d"])? settings["d"] : 0.85;
// Set the similarity function for edge weighting
this.userDefinedSimilarity = (settings["sim"])? settings["sim"] : null;
// Tokens are a sentence [ sentence1, sentence2, sentence3, ... , sentenceN ]
this.userDefinedTokens = (settings["tokens"])? settings["tokens"]: null;
// Split are the sentences tokenized into words [[word1, word2, ... , wordN],[word1, word2, ... , wordN], ..., [word1, word2, ... , wordN]]
this.userDefinedTokensSplit = (settings["split"])? settings["split"]: null;
this.typeOfSummary = (settings["summaryType"])? 1 : 0;
this.graph = {
V: {}, // Sentences are the vertices of the graph
E: {},
numVerts: 0
}
this.summarizedArticle = "";
// convergence threshold
this.delta = 0.0001
// Constructs the graph
this.setupGraph = function (article) {
// The TextPreprocesser cleans up and tokenizes the article
this.graph.V = TextPreprocesser(article, this.userDefinedTokens, this.userDefinedTokensSplit);
this.graph.numVerts = Object.keys(this.graph.V).length;
// Check for user defined similarity function
this.sim = (this.userDefinedSimilarity != null)? this.userDefinedSimilarity : this.similarityScoring;
// Init vertex scores
for(iIndex in this.graph.V) {
var vertex = this.graph.V[iIndex];
// The initial score of a vertex is random and does not matter for the TextRank algorithm
vertex["score"] = Math.random() * 10 + 1;
// Id is the sentence position starting from 0
vertex["id"] = Number(iIndex);
var Si = vertex;
// Add an edge between every sentence in the graph
// Fully connected graph
for (var j = 0; j < this.graph.numVerts; j++) {
var jIndex = j.toString();
// No self edges
if(jIndex != iIndex) {
// If no edge list, create it
if(!this.graph.E[iIndex]) {
this.graph.E[iIndex] = {};
}
var Sj = this.graph.V[jIndex];
// Compute the edge weight between two sentences in the graph
this.graph.E[iIndex][jIndex] = this.sim(Si, Sj);
}
}
}
}
// Given two sentences compute a score which is the weight on the edge between the two sentence
// Implementation of Similarity(Si, Sj) function defined in the paper
this.similarityScoring = function (Si, Sj) {
var overlap = {}
var Si_tokens = Si.tokens;
var Sj_tokens = Sj.tokens;
// Count words for sentence i
for(var i = 0; i < Si_tokens.length; i++) {
var word = Si_tokens[i];
if(!overlap[word]) {
overlap[word] = {}
}
overlap[word]['i'] = 1;
}
// Count words for sentence j
for(var i = 0; i < Sj_tokens.length; i++) {
var word = Sj_tokens[i];
if(!overlap[word]) {
overlap[word] = {}
}
overlap[word]['j'] = 1;
}
var logLengths = Math.log(Si_tokens.length) + Math.log(Sj_tokens.length);
var wordOverlapCount = 0;
// Compute word overlap from the sentences
for( index in overlap) {
var word = overlap[index]
if ( Object.keys(word).length === 2) {
wordOverlapCount++;
}
}
// Compute score
return wordOverlapCount/logLengths;
}
this.iterations = 0;
this.iterateAgain = true;
// The Weighted Graph WS(Vi) function to score a vertex
this.iterate = function () {
for(index in this.graph.V){
var vertex = this.graph.V[index]; // Vi vertex
var score_0 = vertex.score;
var vertexNeighbors = this.graph.E[index]; // In(Vi) set
var summedNeighbors = 0;
// Sum over In(Vi)
for (neighborIndex in vertexNeighbors) {
var neighbor = vertexNeighbors[neighborIndex]; // Vj
var wji = this.graph.E[index][neighborIndex]; // wji
// Sum over Out(Vj)
var outNeighbors = this.graph.E[neighborIndex];
var summedOutWeight = 1; // Stores the summation of weights over the Out(Vj)
for( outIndex in outNeighbors) {
summedOutWeight += outNeighbors[outIndex];
}
var WSVertex = this.graph.V[neighborIndex].score; // WS(Vj)
summedNeighbors += (wji/summedOutWeight) * WSVertex;
}
var score_1 = (1 - this.d) + this.d * summedNeighbors; // WS(Vi)
// Update the score on the vertex
this.graph.V[index].score = score_1;
// Check to see if you should continue
if(Math.abs(score_1 - score_0) <= this.delta) {
this.iterateAgain = false;
}
}
// Check for another iteration
if(this.iterateAgain == true) {
this.iterations += 1;
this.iterate();
}else {
// Prints only once
// console.log(this.iterations);
}
return;
}
// Extracts the top N sentences
this.extractSummary = function (N) {
var sentences = [];
// Graph all the sentences
for ( index in this.graph.V) {
sentences.push(this.graph.V[index]);
}
// Sort the sentences based off the score of the vertex
sentences = sentences.sort( function (a,b) {
if (a.score > b.score) {
return -1;
}else {
return 1;
}
});
// Grab the top N sentences
// var sentences = sentences.slice(0,0+(N));
sentences.length = N;
// Sort based of the id which is the position of the sentence in the original article
sentences = sentences.sort(function (a,b) {
if (a.id < b.id) {
return -1;
} else {
return 1;
}
})
var summary = null;
if(this.typeOfSummary) {
summary = [];
for (var i = 0; i < sentences.length; i++) {
summary.push(sentences[i].sentence);
}
} else {
// Compose the summary by joining the ranked sentences
var summary = sentences[0].sentence;
for (var i = 1; i < sentences.length; i++) {
summary += " " + sentences[i].sentence;
}
}
return summary;
}
this.run = function (article) {
// Create graph structure
this.setupGraph(article);
// Rank sentences
this.iterate();
this.summarizedArticle = this.extractSummary(this.extractAmount);
}
this.run(article);
}
// Handles the preprocessing of text for creating the graph structure of TextRank
function TextPreprocesser(article, userTokens, userTokensSplit) {
// Fucntion to clean up anything with the article that is passed in.
this.cleanArticle = function (article) {
// Regex to remove two or more spaces in a row.
return article.replace(/[ ]+(?= )/g, "");
}
// tokenizer takes a string {article} and turns it into an array of sentences
// tokens are sentences, must end with (!?.) characters
this.tokenizer = function(article) {
return article.replace(/([ ][".A-Za-z-|0-9]+[!|.|?|"](?=[ ]["“A-Z]))/g, "$1|").split("|");
}
// Cleans up the tokens
// tokens are sentences
this.cleanTokens = function(tokens) {
// Iterate backwards to allow for splicing.
for (var i = tokens.length - 1; i >= 0; i--) {
// Current Token
var token = tokens[i]
// Empty String
if(token == "") {
tokens.splice(i,1);
}else { // Since string is not empty clean it up
// Remove all spaces leading the sentence
tokens[i] = token.replace(/[ .]*/,"")
}
}
return tokens;
}
// given a sentence, split it up into the amount of words in the sentence
this.tokenizeASentence = function(sentence) {
// lowercase all the words in the sentences
var lc_sentence = sentence.toLowerCase();
/*
Regex Expression Below :
Example: cool, awesome, something else, and yup
The delimiters like commas (,) (:) (;) etc ... need to be removed
When scoring sentences against each other you do not want to compare
{cool,} against {cool} because they will not match since the comma stays with {cool,}
*/
// put spaces between all characters to split into words
var replaceToSpaceWithoutAfterSpace = /[-|'|"|(|)|/|<|>|,|:|;](?! )/g;
lc_sentence = lc_sentence.replace(replaceToSpaceWithoutAfterSpace," ");
// Now replace all characters with blank
var replaceToBlankWithCharacters = /[-|'|"|(|)|/|<|>|,|:|;]/g;
lc_sentence = lc_sentence.replace(replaceToBlankWithCharacters,"");
// Split into the words based off spaces since cleaned up
return lc_sentence.split(" ");
}
this.outputPreprocess = function(article) {
var cleanedArticle = this.cleanArticle(article);
// Check for user tokens
var usingUserDefinedTokens = (userTokens && userTokensSplit);
var tokens = (usingUserDefinedTokens)? userTokens : this.cleanTokens(this.tokenizer(cleanedArticle));
var output = {};
for (var i = 0; i < tokens.length; i++) {
var tokenizedSentence = (usingUserDefinedTokens)? userTokensSplit[i]: this.tokenizeASentence(tokens[i]);
output[i] = {
sentence: tokens[i],
tokens: tokenizedSentence
};
}
return output;
}
return this.outputPreprocess(article);
}