# ner.py
import argparse
import sys
import time
from nerdata import *
from utils import *
from models import *
from collections import Counter
from typing import List


def _parse_args():
    """
    Command-line arguments to the system. --model switches between the main modes you'll need to use. The other
    arguments are provided for convenience.
    :return: the parsed args bundle
    """
    parser = argparse.ArgumentParser(description='trainer.py')
    parser.add_argument('--model', type=str, default='BAD', help='model to run (BAD, HMM, CRF)')
    parser.add_argument('--inference', type=str, default='VITERBI', help='inference to run (VITERBI, BEAM, or BOTH)')
    parser.add_argument('--train_path', type=str, default='data/eng.train', help='path to train set (you should not need to modify)')
    parser.add_argument('--dev_path', type=str, default='data/eng.testa', help='path to dev set (you should not need to modify)')
    parser.add_argument('--blind_test_path', type=str, default='data/eng.testb.blind', help='path to blind test set (you should not need to modify)')
    parser.add_argument('--test_output_path', type=str, default='eng.testb.out', help='output path for test predictions')
    parser.add_argument('--no_run_on_test', dest='run_on_test', default=True, action='store_false', help='skip printing output on the test set')
    parser.add_argument('--silent', dest='silent', default=False, action='store_true', help='suppress most of the default printing')
    args = parser.parse_args()
    return args
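
# Example invocations (illustrative only; assumes the CoNLL-format data files exist at the default
# paths configured above):
#   python ner.py --model BAD
#   python ner.py --model HMM --silent
#   python ner.py --model CRF --inference BOTH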


class BadNerModel(object):
    """
    NER model that simply assigns each word its most likely observed tag in training.

    Attributes:
        words_to_tag_counters: dictionary where each word (string) is mapped to a Counter over tags representing
            counts observed in training
    """
    def __init__(self, words_to_tag_counters):
        self.words_to_tag_counters = words_to_tag_counters

    def decode(self, sentence_tokens: List[Token]) -> LabeledSentence:
        """
        :param sentence_tokens: List of the tokens in the sentence to tag
        :return: The LabeledSentence consisting of predictions over the sentence
        """
        pred_tags = []
        for tok in sentence_tokens:
            if tok.word in self.words_to_tag_counters:
                # [0] selects the top most common (tag, count) pair, the next [0] picks out the tag itself
                pred_tags.append(self.words_to_tag_counters[tok.word].most_common(1)[0][0])
            else:
                pred_tags.append("O")
        return LabeledSentence(sentence_tokens, chunks_from_bio_tag_seq(pred_tags))

    def decode_beam(self, sentence_tokens: List[Token]) -> LabeledSentence:
        raise Exception("Not implemented because we don't need beam search in this model")
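
# Illustrative sketch of decode's lookup (toy counts made up for this comment, not used anywhere):
# a word seen in training gets its most frequent training tag; unseen words fall back to "O".
#     toy_counters = {"Paris": Counter({"B-LOC": 3.0, "O": 1.0})}
#     toy_counters["Paris"].most_common(1)[0][0]   # -> "B-LOC"
#     "Rome" in toy_counters                       # -> False, so decode emits "O" for "Rome"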


def train_bad_ner_model(training_set: List[LabeledSentence]) -> BadNerModel:
    """
    :param training_set: labeled NER sentences to extract a BadNerModel from
    :return: the BadNerModel based on counts collected from the training data
    """
    words_to_tag_counters = {}
    for sentence in training_set:
        tags = sentence.get_bio_tags()
        for idx in range(0, len(sentence)):
            word = sentence.tokens[idx].word
            if word not in words_to_tag_counters:
                words_to_tag_counters[word] = Counter()
            words_to_tag_counters[word][tags[idx]] += 1.0
    return BadNerModel(words_to_tag_counters)
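
# Usage sketch (hedged: this just mirrors what the driver code below does for the BAD model):
#     bad_model = train_bad_ner_model(train)                        # train is a List[LabeledSentence]
#     dev_decoded = [bad_model.decode(ex.tokens) for ex in dev]     # tag each dev sentence
#     print_evaluation(dev, dev_decoded)                            # scoring helper used by the HMM/CRF branches below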


if __name__ == '__main__':
    start_time = time.time()
    args = _parse_args()
    if not args.silent:
        print(args)

    # Load the training and dev data
    train = read_data(args.train_path)
    dev = read_data(args.dev_path)

    # Here are a few example sentences...
    if not args.silent:
        print("Examples of sentences:")
        print(str(dev[1]))
        print(str(dev[3]))
        print(str(dev[5]))
    system_to_run = args.model

    # Train our model
    if system_to_run == "BAD":
        bad_model = train_bad_ner_model(train)
        dev_decoded = [bad_model.decode(test_ex.tokens) for test_ex in dev]
    elif system_to_run == "HMM":
        hmm_model = train_hmm_model(train, silent=args.silent)
        # Start the timer before decoding so the reported time covers the actual Viterbi pass
        dev_start_time = time.time()
        dev_decoded = [hmm_model.decode(test_ex.tokens) for test_ex in dev]
        print("Viterbi decoding took %f seconds" % (time.time() - dev_start_time))
        print_evaluation(dev, dev_decoded)
    elif system_to_run == "CRF":
        crf_model = train_crf_model(train, silent=args.silent)
        if not args.silent:
            print("Data reading and training took %f seconds" % (time.time() - start_time))
        if args.inference == "BEAM" or args.inference == "BOTH":
            dev_start_time = time.time()
            dev_decoded = [crf_model.decode_beam(test_ex.tokens) for test_ex in dev]
            print("Beam decoding took %f seconds" % (time.time() - dev_start_time))
            print_evaluation(dev, dev_decoded)
        if args.inference == "VITERBI" or args.inference == "BOTH":
            dev_start_time = time.time()
            dev_decoded = [crf_model.decode(test_ex.tokens) for test_ex in dev]
            print("Viterbi decoding took %f seconds" % (time.time() - dev_start_time))
            print_evaluation(dev, dev_decoded)
        # If args.run_on_test is left at its default (True), run the CRF on the blind test set to produce final output
        if args.run_on_test:
            if not args.silent:
                print("Running on test")
            test = read_data(args.blind_test_path)
            # Only run inference with BEAM if it's specified; if BOTH, just use Viterbi
            if args.inference == "BEAM":
                beam_start_time = time.time()
                test_decoded = [crf_model.decode_beam(test_ex.tokens) for test_ex in test]
                print("Test beam decoding took %f seconds" % (time.time() - beam_start_time))
            elif args.inference == "VITERBI" or args.inference == "BOTH":
                viterbi_start_time = time.time()
                test_decoded = [crf_model.decode(test_ex.tokens) for test_ex in test]
                print("Test Viterbi decoding took %f seconds" % (time.time() - viterbi_start_time))
            else:
                raise Exception("Invalid inference mode")
            print_output(test_decoded, args.test_output_path)
    else:
        raise Exception("Pass in either BAD, HMM, or CRF to run the appropriate system")