"""
indexer exposes a function inex_docs which if given a data file,
processes every line of that file, builds an inverted index from unigrams to a list of Document objects.
"""
from collections import Counter
from typing import List, Tuple
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from pymongo.collection import Collection
from pymongo.errors import DuplicateKeyError
from index_interface import TermInDocument, SearchResult
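
# Note (assumption, not from the original): index_interface is a local module.
# Based on how its classes are used below, TermInDocument appears to be a mutable
# record constructed as TermInDocument(docid, title, term_freq, position, score),
# and SearchResult is constructed as SearchResult(docid, title, score, position).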


def get_title_and_body(content: str) -> Tuple[str, str]:
    title, body = content.split(" ", 1)
    # remove leading quote in title
    if title and title[0] == '"':
        title = title[1:].lstrip()
    # remove trailing quote in body
    if body and body[-1] == '"':
        body = body[:-1].rstrip()
    # restrict title to at most 10 words
    if len(title.split(" ")) > 10:
        title = " ".join(title.split(" ")[:10])
    return title, body
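

# Illustrative example (assumption, not from the original file): for a content
# string such as '"Anarchism Anarchism is a political philosophy ..."',
# get_title_and_body() strips the surrounding quotes and returns
# ("Anarchism", "Anarchism is a political philosophy ...").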


def upload(collection: Collection, filename: str) -> int:
    # uploads articles from file into the mongodb collection and returns a count
    with open(filename) as fp:
        cnt = 0
        for line in fp:
            # skip blank lines (lines read from a file keep their trailing newline)
            if len(line.strip()) == 0:
                continue
            fields = line.split(",", 1)
            docid = fields[0].strip()
            content = fields[1].strip()
            title, body = get_title_and_body(content)
            document = {"_id": docid, "title": title, "body": body}
            try:
                collection.insert_one(document)
            except DuplicateKeyError:
                collection.update_one({"_id": docid}, {"$set": {"title": title, "body": body}})
            cnt += 1
    return cnt


# Processing:
# 1. Find unigrams in content.
# 2. Remove stop words.
# 3. Compute the position fraction of each unigram in the document. This is close to
#    1 if it is the first word and 0 if it is the last word.
# 4. Compute the frequency of each unigram in the document.
# 5. For each unigram, maintain a list of documents that the unigram can be found in.
# 5b. In addition to the docid, keep other metadata such as the frequency of the unigram
#     in the document and the position of the unigram in the document.
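# Worked example (illustration, not from the original): in a document with 5
# content tokens, the first token gets position (5 - 1) / 5 = 0.8 and the last
# token gets (5 - 5) / 5 = 0.0, so earlier words score higher on this feature.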
def create_unigram_index(articles: Collection, unigrams: Collection) -> Tuple[int, int]:
    """
    Create a unigram TF index from the article collection and store it in the unigram collection.
    Returns the number of unigrams and documents.
    """
    stop_words = set(stopwords.words("english"))
    ps = PorterStemmer()
    posting_list = {}
    doccnt = 0
    for doc in articles.find():
        doccnt += 1
        content = doc["title"] + " " + doc["body"]
        word_tokens = [
            ps.stem(x.lower()) for x in word_tokenize(content) if x.lower() not in stop_words
        ]
        doclen = len(word_tokens)  # number of content tokens, used to compute the position feature
        wcount = 0
        for unigram in word_tokens:
            wcount += 1
            # process unigram
            if unigram not in posting_list:
                posting_list[unigram] = []
            if (len(posting_list[unigram]) == 0) or (
                posting_list[unigram][-1].docid != doc["_id"]
            ):
                # first occurrence of word/unigram in document
                posting_list[unigram].append(
                    TermInDocument(doc["_id"], doc["title"], 1, float(doclen - wcount) / doclen, 0.0)
                )
            else:
                posting_list[unigram][-1].term_freq += 1
    # Now we have processed all docs.
    # Compute the score for all documents of all unigrams and
    # sort each posting list based on score.
    # weights to be learned
    w_tf = 0.3
    w_pos = 0.7
    unicnt = 0
    for unigram in posting_list:
        unicnt += 1
        for doc in posting_list[unigram]:
            doc.score = w_tf * doc.term_freq + w_pos * doc.position
        posting_list[unigram] = sorted(posting_list[unigram], key=lambda x: x.score, reverse=True)
        unigrams.insert_one(
            {
                "_id": unigram,
                "docs": [
                    {
                        "docid": doc.docid,
                        "title": doc.title,
                        "term_freq": doc.term_freq,
                        "position": doc.position,
                        "score": doc.score,
                    }
                    for doc in posting_list[unigram]
                ],
            }
        )
    return (unicnt, doccnt)
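

# Worked example for the scoring in create_unigram_index above (illustration,
# not from the original): with w_tf = 0.3 and w_pos = 0.7, a unigram that
# appears 3 times in a document and first occurs near the top (position ~0.9)
# scores 0.3 * 3 + 0.7 * 0.9 = 1.53.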


def search_unigrams(expr: str, offset: int, limit: int, unigrams: Collection) -> List[SearchResult]:
    stop_words = set(stopwords.words("english"))
    ps = PorterStemmer()
    # Convert to lowercase and also stem the words
    search_words = [ps.stem(x.lower()) for x in word_tokenize(expr)]
    # Remove stop words from the set of words to search
    search_words = [x for x in search_words if x not in stop_words]
    # Add the score for each unigram
    doc_scores = Counter()
    docid_to_title = {}
    docid_to_position = {}
    for word in search_words:
        result = unigrams.find_one({"_id": word})
        if result is not None:
            for doc in result["docs"]:
                # Score of a document is the sum of its scores over all searched terms
                doc_scores[doc["docid"]] += doc["score"]
                docid_to_title[doc["docid"]] = doc["title"]
                docid_to_position[doc["docid"]] = doc["position"]
    final_results = [
        SearchResult(docid, docid_to_title[docid], score, docid_to_position[docid])
        for docid, score in doc_scores.most_common(offset + limit)
    ]
    return final_results[offset:offset + limit]
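

# --- Usage sketch (assumption, not part of the original module) ---
# A minimal example of how these functions might be wired together, assuming a
# local MongoDB instance; the database/collection names and the data file below
# are hypothetical placeholders.
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")  # assumed connection string
    db = client["search_demo"]                         # hypothetical database name
    articles_coll = db["articles"]                     # hypothetical collection names
    unigrams_coll = db["unigrams"]

    uploaded = upload(articles_coll, "articles.csv")   # hypothetical data file
    n_unigrams, n_docs = create_unigram_index(articles_coll, unigrams_coll)
    print(f"uploaded {uploaded} articles, indexed {n_unigrams} unigrams across {n_docs} docs")

    for result in search_unigrams("political philosophy", 0, 10, unigrams_coll):
        print(result)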