-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtaggers.py
131 lines (115 loc) · 4.63 KB
/
taggers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# -*- coding: utf-8 -*-
'''Parts-of-speech tagger implementations.'''
import random
import os.path
from collections import defaultdict
import pickle
from _perceptron import Perceptron
START = ['-START-', '-START2-']
END = ['-END-', '-END2-']
AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), 'trontagger.pickle')
class PerceptronTagger(object):
'''Greedy Averaged Perceptron tagger'''
def __init__(self, load=True):
self.model = Perceptron()
self.tagdict = {}
self.classes = set()
if load:
self.load(AP_MODEL_LOC)
def tag(self, words, tokenize=True):
prev, prev2 = START
tags = []
context = START + [self._normalize(w) for w in words] + END
for i, word in enumerate(words):
tag = self.tagdict.get(word)
if not tag:
features = self._get_features(i, word, context, prev, prev2)
tag = self.model.predict(features)
tags.append(tag)
prev2 = prev; prev = tag
return tags
def start_training(self, sentences, quiet=True):
self._make_tagdict(sentences, quiet=quiet)
self.model.classes = self.classes
def train(self, sentences, save_loc=None, nr_iter=5, quiet=False):
'''Train a model from sentences, and save it at save_loc. nr_iter
controls the number of Perceptron training iterations.'''
self.start_training(sentences)
for iter_ in range(nr_iter):
c = 0; n = 0
for words, tags in sentences:
self.train_one(words, tags)
random.shuffle(sentences)
if not quiet:
print("Iter %d: %d/%d=%.3f" % (iter_, c, n, _pc(c, n)))
def end_training(self, save_loc=None):
self.model.average_weights()
# Pickle as a binary file
if save_loc is not None:
pickle.dump((self.model.weights, self.tagdict, self.classes),
open(save_loc, 'wb'), -1)
def train_one(self, words, tags):
prev, prev2 = START
context = START + [self._normalize(w) for w in words] + END
for i, word in enumerate(words):
guess = self.tagdict.get(word)
if not guess:
feats = self._get_features(i, word, context, prev, prev2)
guess = self.model.predict(feats)
self.model.update(tags[i], guess, feats)
prev2 = prev; prev = guess
def load(self, loc):
w_td_c = pickle.load(open(loc, 'rb'))
self.model.weights, self.tagdict, self.classes = w_td_c
self.model.classes = self.classes
def _normalize(self, word):
if '-' in word and word[0] != '-':
return '!HYPHEN'
elif word.isdigit() and len(word) == 4:
return '!YEAR'
elif word[0].isdigit():
return '!DIGITS'
else:
return word.lower()
def _get_features(self, i, word, context, prev, prev2):
'''Map tokens into a feature representation, implemented as a
{hashable: float} dict. If the features change, a new model must be
trained.'''
def add(name, *args):
features[' '.join((name,) + tuple(args))] += 1
i += len(START)
features = defaultdict(int)
# It's useful to have a constant feature, which acts sort of like a prior
add('bias')
add('i suffix', word[-3:])
add('i pref1', word[0])
add('i-1 tag', prev)
add('i-2 tag', prev2)
add('i tag+i-2 tag', prev, prev2)
add('i word', context[i])
add('i-1 tag+i word', prev, context[i])
add('i-1 word', context[i-1])
add('i-1 suffix', context[i-1][-3:])
add('i-2 word', context[i-2])
add('i+1 word', context[i+1])
add('i+1 suffix', context[i+1][-3:])
add('i+2 word', context[i+2])
return features
def _make_tagdict(self, sentences, quiet=False):
'''Make a tag dictionary for single-tag words.'''
counts = defaultdict(lambda: defaultdict(int))
for words, tags in sentences:
for word, tag in zip(words, tags):
counts[word][tag] += 1
self.classes.add(tag)
freq_thresh = 20
ambiguity_thresh = 0.97
for word, tag_freqs in counts.items():
tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
n = sum(tag_freqs.values())
# Don't add rare words to the tag dictionary
# Only add quite unambiguous words
if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
self.tagdict[word] = tag
def _pc(n, d):
return (float(n) / d) * 100