-
Notifications
You must be signed in to change notification settings - Fork 1
/
word_filters.py
56 lines (54 loc) · 2.91 KB
/
word_filters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Define filters that exclude certain kinds of neighbors from the results.
All the filters are compatible with the Neighbors class (Neighbors.py) and its filter functions.
"""
from nltk import pos_tag, tokenize
def _load_allowed_categories(path):
    """Parse a category config file into a ``{category: [allowed tags]}`` dict.

    Each non-blank line has the form ``<category>: <tag>, <tag>, ...``;
    spaces are ignored.
    """
    allowed = {}
    # Context manager so the file handle is closed even if parsing fails.
    with open(path, "r") as cfg:
        for raw_line in cfg:
            entry = raw_line.rstrip('\n').replace(' ', '')
            if not entry:
                # Tolerate blank lines instead of failing on the unpacking below.
                continue
            key, tags = entry.split(':')
            allowed[key] = tags.split(',')
    return allowed


def nltk_filter(w, candidates, granularity='exact', auxiliary_list=None):
    """
    Keep only the candidates whose part-of-speech tag is compatible with w's.

    Both ``w`` and ``candidates`` are tagged with ``nltk.pos_tag``; a candidate
    is kept or discarded by comparing its tag with the tag of ``w`` according
    to ``granularity``.

    Parameters
    ----------
    w: string
        Word whose tag is used as the reference for filtering.
    candidates: list
        List of words to filter, e.g. ['good', 'bad', 'better'].
    granularity: string
        optional: one of
        - 'exact': keep only the words whose tag is exactly the tag of ``w``;
        - 'smooth': keep the words whose tag shares its first 2 letters with
          the tag of ``w`` (e.g. with Penn Treebank tags, 'VB' and 'VBD' both
          start with 'VB', so a past-tense verb is kept for a base-form verb,
          while 'exact' would discard it);
        - 'custom': keep the words whose tag is listed for the tag of ``w`` in
          the file './filters/nltk.cfg', where each line has the form
          ``<category>: <tag>, <tag>, ...``.
    auxiliary_list: list
        optional: list with the same size as ``candidates``. When given, the
        element at the position of every kept word is returned as well, so the
        two returned lists stay aligned.

    Returns
    -------
    The list of kept words if ``auxiliary_list`` is None, otherwise the tuple
    ``(kept words, kept auxiliary elements)``.

    Raises
    ------
    NotImplementedError
        If ``granularity`` is not 'exact', 'smooth' or 'custom'. Raised before
        any tagging is done.
    KeyError
        With granularity='custom', if the tag of ``w`` has no entry in the
        configuration file.
    """
    # Validate first: fail fast on a bad argument rather than midway through
    # the loop (and not at all when `candidates` is empty).
    if granularity not in ('exact', 'smooth', 'custom'):
        raise NotImplementedError("Not Implemented Exception: we are working on it.")

    # pos_tag returns [(<word>, <category>)]; keep only the category of w.
    category = pos_tag([w])[0][1]

    # Pick the acceptance rule once instead of re-testing granularity per word.
    if granularity == 'exact':
        def keep(tag):
            return tag == category
    elif granularity == 'smooth':
        prefix = category[:2]

        def keep(tag):
            return tag[:2] == prefix
    else:
        allowed_tags = _load_allowed_categories("./filters/nltk.cfg")[category]

        def keep(tag):
            return tag in allowed_tags

    filtered_words, auxiliary_elements = [], []
    for position, (word, tag) in enumerate(pos_tag(candidates)):
        if not keep(tag):
            continue
        filtered_words.append(word)
        # Only the auxiliary element of a kept word is retained, so both
        # output lists have the same length and order. `is not None` (rather
        # than `!= None`) also works when auxiliary_list is an array-like.
        if auxiliary_list is not None:
            auxiliary_elements.append(auxiliary_list[position])

    if auxiliary_list is None:
        return filtered_words
    return filtered_words, auxiliary_elements