#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import string
import codecs
import numpy as np
from collections import defaultdict, Counter
from config import *  # provides args, UNKNOWN, NUMBER, SENT_START, SENT_END, experiment_dir, write_mapping
from codecs import open  # unicode-aware open() with an encoding argument on Python 2 as well


def read_word_embeddings(fname):
    """Read whitespace-separated embeddings (one "word v1 v2 ... vn" per line).

    Returns a word->vector map, a word->id map, and the vector dimension."""
    word_vec_map = {}
    word_id_map = {}
    with open(fname, 'r', encoding='utf-8') as in_f:
        for idx, line in enumerate(in_f):
            fields = line.strip().split()
            word = fields[0]
            embedding = np.asarray([float(i) for i in fields[1:]], dtype=np.float32)
            word_vec_map[word] = embedding
            word_id_map[word] = len(word_id_map)
    # get dimensions from the last word added
    vec_dim = len(word_vec_map[word])
    return word_vec_map, word_id_map, vec_dim
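

# Hedged sketch: a typical way to consume the two maps returned by read_word_embeddings
# is to stack them into a dense matrix whose row order follows word_id_map, e.g. to
# initialise an embedding layer. The helper name and signature below are assumptions
# for illustration, not part of the rest of this module's API.
def build_embedding_matrix(word_vec_map, word_id_map, vec_dim):
    """Illustrative helper: row i of the returned matrix holds the vector of the word with id i."""
    emb_matrix = np.zeros((len(word_id_map), vec_dim), dtype=np.float32)
    for word, idx in word_id_map.items():
        emb_matrix[idx] = word_vec_map[word]
    return emb_matrix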


def load_character_data(fname, char_to_id, max_sent_len, max_word_len=32):
    """Read tab-separated sentences and convert each one to a list of character
    (or UTF-8 byte) ids, wrapped in SENT_START/SENT_END."""
    X = []
    with open(fname, 'r', encoding='utf-8') as in_f:
        for line in in_f:
            sent_id, sentence, category, intensity = line.strip().split('\t')
            if args.bytes:
                # represent the sentence as UTF-8 byte values; bytearray yields ints
                # under both Python 2 and 3 (unlike mapping ord over an encoded string)
                char_ids = [char_to_id[byte] for char in sentence for byte in bytearray(char.encode('utf8'))]
            else:
                char_ids = [char_to_id[char] for char in sentence]
            X.append([char_to_id[SENT_START]] + char_ids + [char_to_id[SENT_END]])
    return X
# sent_lens = [len(s) for s in X]
# max_sent_len_data = max(sent_lens)
# percentile = int(np.percentile(sent_lens, 90))
#
# # cutoff for max_sent_len
# discarded_tokens = 0
# used_tokens = 0
# cut_X = []
# for sent in X:
# discarded_tokens += max(len(sent) - max_word_len, 0)
# used_tokens += len(sent[:max_word_len])
# cut_X.append(sent[:max_word_len])
#
# if __debug__:
# print('max len in dataset: {0}\t90-percentile: {2}\tmax len used: {1}'.format(max_sent_len_data, max_word_len, percentile))
# print('n discarded toks:\t{0}'.format(discarded_tokens))
# print('n used toks:\t\t{0}'.format(used_tokens))
# print('prop used toks:\t\t{0}'.format(used_tokens / float(used_tokens+discarded_tokens)))
#
# return cut_X
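# Hedged usage sketch for load_character_data. The file name, MAX_SENT_LEN, and the
# growable defaultdict vocabulary below are assumptions for illustration, not fixed
# by this module:
#
#   char_to_id = defaultdict(lambda: len(char_to_id))
#   char_to_id[SENT_START]; char_to_id[SENT_END]   # reserve boundary ids first
#   X_chars = load_character_data('train.tsv', char_to_id, max_sent_len=MAX_SENT_LEN)
#
# Each element of X_chars is then [SENT_START id, c1, ..., cn, SENT_END id] for one sentence.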


def load_word_data(fname, word_to_id, tag_to_id, max_sent_len, is_training=False):
    """
    Read a tab-separated file (sent_id, sentence, category, intensity per line)
    and convert each sentence to word ids.
    If is_training, unseen tokens get new ids in the (growable) word_to_id mapping
    (the mwe reconstruction heuristic is currently disabled); otherwise they map to UNKNOWN.
    """
    print("reading", fname)
    X, y = [], []
    with open(fname, 'r', encoding='utf-8') as in_f:
        curr_X, curr_y = [], []
        for line in in_f:
            #line = line.strip()
            #if len(line) == 0: continue
            sent_id, sentence, category, intensity = line.strip().split('\t')
            for token in sentence.split():  # TODO: real tokenisation
                if not args.words:
                    # word features disabled: map every token to UNKNOWN and move on
                    curr_X.append(word_to_id[UNKNOWN])
                    continue
                # some preprocessing: collapse numeric tokens into a single NUMBER symbol
                if re.match(r'^[0-9.,-]+$', token):
                    curr_X.append(word_to_id[NUMBER])
                else:
                    #token = ''.join(ch for ch in token if ch not in exclude)
                    if is_training or token.lower() in word_to_id:  # and args.mwe and ('~' in token or '-' in token):
                        curr_X.append(word_to_id[token.lower()])
                        #curr_X.append(attempt_reconstruction(token, word_to_id))
                    else:
                        # token not in the embeddings: fall back to UNKNOWN (or mwu if option off)
                        curr_X.append(word_to_id[UNKNOWN])
            #curr_y.append(tag_to_id[tag])
            #if args.shorten_sents and len(curr_X) >= max_sent_len-2:
            X.append(curr_X)
            y.append(tag_to_id[intensity])
            curr_X = []  # [word_to_id[SENT_CONT]]
            #curr_y = []  # [tag_to_id[SENT_CONT]]
    ## get some stats on the dataset
    sent_lens = [len(s) for s in X]
    max_sent_len_data = max(sent_lens)
    percentile = int(np.percentile(sent_lens, 90))

    ## max sentence cutoff
    # Two options: either discard over-long sentences (as in earlier code), or keep every
    # sentence truncated to max_sent_len. Leave room for padding.
    old_len = len(X)
    discarded_tokens = 0
    total_toks = 0
    for idx, s in enumerate(X):
        total_toks += len(s)
        if len(s) > max_sent_len - 2:
            discarded_tokens += len(s) - (max_sent_len - 2)

    if is_training:
        # keep only the args.nwords most frequent word ids and re-index them densely
        to_use = set(key for key, val in Counter(word for sentence in X for word in sentence).most_common(args.nwords))
        id_to_word = dict((value, key) for key, value in word_to_id.items())
        filtered_word_to_id = defaultdict(lambda: len(filtered_word_to_id))
        word_filtered = []
        for sentence in X:
            filtered_sent = []
            for word_id in sentence:
                if word_id in to_use:
                    filtered_sent.append(filtered_word_to_id[id_to_word[word_id]])
            word_filtered.append(filtered_sent)
        word_to_id = filtered_word_to_id
        X = word_filtered

    # truncate, leaving room for padding
    X = [s[:max_sent_len-2] for s in X]
    #y = [s[:max_sent_len-2] for s in y]

    if __debug__:
        print('max len in dataset: {0}\t90-percentile: {1}\tmax len used: {2}'.format(max_sent_len_data, percentile, max_sent_len))
        print('n discarded sents: {0}'.format(old_len - len(X)))
        print('n discarded toks: {0}'.format(discarded_tokens))

    if args.bookkeeping:
        # strip the .conllu extension explicitly; rstrip('.conllu') would also eat
        # trailing letters of the base name (e.g. 'train' -> 'trai')
        dsetname = os.path.basename(fname)
        if dsetname.endswith('.conllu'):
            dsetname = dsetname[:-len('.conllu')]
        save_ids(word_to_id, tag_to_id, dsetname)

    return X, y, word_to_id, tag_to_id


def save_ids(word_to_id, tag_to_id, fname):
    """Write the word and tag id mappings to the experiment directory for bookkeeping."""
    write_mapping(word_to_id, experiment_dir + '/{0}_word2id.txt'.format(fname))
    write_mapping(tag_to_id, experiment_dir + '/{0}_tag2id.txt'.format(fname))
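

# Hedged end-to-end sketch of how these helpers are typically wired together. The file
# names, MAX_SENT_LEN, and the growable defaultdict vocabularies are assumptions for
# illustration only:
#
#   word_vec_map, word_id_map, vec_dim = read_word_embeddings('embeddings.vec')
#   word_to_id = defaultdict(lambda: len(word_to_id))
#   word_to_id.update(word_id_map)
#   tag_to_id = defaultdict(lambda: len(tag_to_id))
#   X_train, y_train, word_to_id, tag_to_id = load_word_data(
#       'train.tsv', word_to_id, tag_to_id, MAX_SENT_LEN, is_training=True)
#   X_dev, y_dev, _, _ = load_word_data('dev.tsv', word_to_id, tag_to_id, MAX_SENT_LEN)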