# keywords_tfidf.py
# Forked from KBNLresearch/keyword-generator.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#
# Keyword Generator
#
# Copyright (C) 2015 Juliette Lonij, Koninklijke Bibliotheek -
# National Library of the Netherlands
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import codecs
import corpus as cp
import csv
import gensim
import math
import operator
import os
import sys
import time
# Generate keywords
def generate_keywords(tfidf_scores, num_keywords):
    """Return the top scoring (token_id, score) keyword pairs.

    tfidf_scores -- iterable of documents, each a list of
                    (token_id, tfidf_score) tuples
    num_keywords -- maximum number of keywords to return

    A token's overall score is the sum of its TF-IDF scores over all
    documents; the num_keywords tokens with the highest totals win.
    """
    print('Generating keywords...')
    # Accumulate each token's score across all documents.
    totals = {}
    for doc in tfidf_scores:
        for token_id, score in doc:
            totals[token_id] = totals.get(token_id, 0) + score
    # Highest aggregate score first; sorted() is stable, so ties keep
    # their first-seen order, matching the original behavior.
    ranked = sorted(totals.items(), key=operator.itemgetter(1),
            reverse=True)
    return ranked[:num_keywords]
def print_keywords(keywords, dictionary):
    """Print the ranked keywords, one per line, as '(rank) word [score]'.

    keywords   -- list of (token_id, score) pairs, best first
    dictionary -- mapping from token_id to the token string
    """
    print('Keywords generated:')
    rank = 1
    for token_id, score in keywords:
        print('(%i) %s [%s]' % (rank, dictionary.get(token_id), score))
        rank += 1
def save_keywords(keywords, dictionary):
    """Write the keywords to data/results/<timestamp>_keywords.csv.

    keywords   -- list of (token_id, score) pairs, best first
    dictionary -- mapping from token_id to the token string

    Each row is 'word<TAB>score'. The file name is the current Unix
    timestamp, so repeated runs never overwrite earlier results.
    """
    timestamp = int(time.time())
    out_dir = 'data' + os.sep + 'results'
    # Create the results folder on first use instead of crashing.
    os.makedirs(out_dir, exist_ok=True)
    out_path = (out_dir + os.sep + str(timestamp) + '_keywords' + '.csv')
    # The csv module requires a text-mode stream with newline=''; the
    # old 'wb' + per-field .encode('utf-8') approach fails on Python 3,
    # where csv.writer expects str, not bytes.
    with open(out_path, 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f, delimiter='\t')
        for token_id, score in keywords:
            csv_writer.writerow([dictionary.get(token_id), str(score)])
if __name__ == '__main__':
    # Make sure keyword output can be printed as UTF-8. Compare the
    # encoding name case-insensitively: Python 3 reports 'utf-8' (lower
    # case), which failed the old exact 'UTF-8' check and caused the
    # stream to be wrapped needlessly. Guard against encoding being
    # None (e.g. when stdout is piped).
    if (sys.stdout.encoding or '').upper() != 'UTF-8':
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout, 'strict')

    # Command-line options: -k number of keywords, -d document length.
    parser = argparse.ArgumentParser()
    parser.add_argument('-k', required=False, type=int, default=10,
            help='number of keywords')
    parser.add_argument('-d', required=False, type=int, default=0,
            help='document length')
    args = parser.parse_args()
    num_keywords, doc_length = args.k, args.d

    # Input locations: documents to analyse and stop word lists.
    doc_folder = 'data' + os.sep + 'documents'
    stop_folder = 'data' + os.sep + 'stop_words'

    # Build the corpus, fit a TF-IDF model and score every document.
    corpus, dictionary = cp.MyCorpus(doc_folder, stop_folder, doc_length).load()
    tfidf = gensim.models.TfidfModel(corpus)
    tfidf_scores = tfidf[corpus]

    keywords = generate_keywords(tfidf_scores, num_keywords)
    print_keywords(keywords, dictionary)
    save_keywords(keywords, dictionary)