forked from sebastianruder/learn-to-select-data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeatures.py
211 lines (184 loc) · 9.4 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
"""
Methods for generating features that can be used for Bayesian Optimization.
"""
import numpy as np
import similarity
from constants import SIMILARITY_FUNCTIONS, DIVERSITY_FEATURES
def get_feature_representations(feature_names, examples, trg_examples, vocab,
word2vec=None, topic_vectorizer=None,
lda_model=None, lowercase=True):
"""
Retrieve the feature representations of a list of examples.
:param feature_names: a list containing the names of features to be used
:param examples: a list of tokenized documents of all source domains
:param trg_examples: a list of tokenized documents of the target domain
:param vocab: the Vocabulary object
:param word2vec: a mapping of a word to its word vector representation
(e.g. GloVe or word2vec)
:param topic_vectorizer: the CountVectorizer object used to transform
tokenized documents for LDA
:param lda_model: the trained LDA model
:param lowercase: lower-case the input examples for source and target
domains
:return: the feature representations of the examples as a 2d numpy array of
shape (num_examples, num_features)
"""
features = np.zeros((len(examples), len(feature_names)))
if lowercase:
print('Lower-casing the source and target domain examples...')
examples = [[word.lower() for word in example] for example in examples]
trg_examples = [[word.lower() for word in trg_example]
for trg_example in trg_examples]
# get the term distribution of the entire training data
# (a numpy array of shape (vocab_size,) )
train_term_dist = similarity.get_term_dist(examples, vocab)
# get the term distribution of each training example
# (a numpy array of shape (num_examples, vocab_size) )
term_dists = np.array([similarity.get_term_dist(
[example], vocab) for example in examples])
# get the term distribution of the target data
# (a numpy array of shape (vocab_size, ) )
trg_term_dist = similarity.get_term_dist(trg_examples, vocab)
# get the topic distributions of the train and target data
topic_dists, trg_topic_dist = None, None
if any(f_name.startswith('topic') for f_name in feature_names):
print('Retrieving the topic distributions of source and target data...')
topic_dists = similarity.get_topic_distributions(
examples, topic_vectorizer, lda_model)
trg_topic_dist = np.mean(similarity.get_topic_distributions(
trg_examples, topic_vectorizer, lda_model), axis=0)
print('Shape of topic distributions:', topic_dists.shape)
print('Shape of target topic dist:', trg_topic_dist.shape)
# get the word embedding representations of the train and target data
word_reprs, trg_word_repr = None, None
if any(f_name.startswith('word_embedding') for f_name in feature_names):
print('Retrieving the word embedding representations of source and '
'target data...')
word_reprs = similarity.weighted_sum_of_embeddings(
examples, vocab.word2id, word2vec, train_term_dist)
trg_word_repr = np.mean(similarity.weighted_sum_of_embeddings(
trg_examples, vocab.word2id, word2vec, trg_term_dist), axis=0)
print('Shape of word representations:', word_reprs.shape)
print('Shape of target word representation:', trg_word_repr.shape)
for i in range(len(examples)):
for j, f_name in enumerate(feature_names):
# check whether feature belongs to similarity-based features,
# diversity-based features, etc.
if f_name.startswith('topic'):
f = similarity.similarity_name2value(
f_name.split('_')[1], topic_dists[i], trg_topic_dist)
elif f_name.startswith('word_embedding'):
f = similarity.similarity_name2value(
f_name.split('_')[2], word_reprs[i], trg_word_repr)
elif f_name in SIMILARITY_FUNCTIONS:
f = similarity.similarity_name2value(
f_name, term_dists[i], trg_term_dist)
elif f_name in DIVERSITY_FEATURES:
f = diversity_feature_name2value(
f_name, examples[i], train_term_dist, vocab.word2id, word2vec)
else:
raise ValueError('%s is not a valid feature name.' % f_name)
assert not np.isnan(f), 'Error: Feature %s is nan.' % f_name
assert not np.isinf(f), 'Error: Feature %s is inf or -inf.' % f_name
features[i, j] = f
if i % 100 == 0 and i > 0:
print('Created features for %d examples.' % i)
return features
def get_feature_names(feature_set_names):
"""
Given a list of feature sets, return the list of all their feature names.
:param feature_set_names: a list of feature set names,
e.g. 'similarity', 'diversity', etc.
:return: a list of feature names
"""
features = []
if 'similarity' in feature_set_names:
print('Using similarity-based features...')
features += SIMILARITY_FUNCTIONS
if 'topic_similarity' in feature_set_names:
print('Using similarity-based features with topic distributions...')
features += ['topic_%s' % s for s in SIMILARITY_FUNCTIONS]
if 'word_embedding_similarity' in feature_set_names:
print('Using similarity-based features with word embedding '
'representations...')
# jensen-shannon, renyi, bhattacharyya do not work for word embeddings
# as they require positive probabilities
features += ['word_embedding_%s' % s for s in SIMILARITY_FUNCTIONS
if s not in ['jensen-shannon', 'renyi', 'bhattacharyya']]
if 'diversity' in feature_set_names:
print('Using diversity-based features...')
features += DIVERSITY_FEATURES
print('Using %d features.' % (len(features)))
return features
# DIVERSITY-BASED FEATURES
def diversity_feature_name2value(f_name, example, train_term_dist, word2id,
word2vec):
"""
Given a feature name, return the corresponding feature value.
:param f_name: the name of the feature
:param example: the tokenised example document
:param train_term_dist: the term distribution of the training data
:param word2id: the word-to-id mapping
:param word2vec: a mapping of a word to its word vector representation (e.g. GloVe or word2vec)
:return: the value of the corresponding feature
"""
if f_name == 'num_word_types':
return number_of_word_types(example)
if f_name == 'type_token_ratio':
return type_token_ratio(example)
if f_name == 'entropy':
return entropy(example, train_term_dist, word2id)
if f_name == 'simpsons_index':
return simpsons_index(example, train_term_dist, word2id)
if f_name == 'quadratic_entropy':
return quadratic_entropy(example, train_term_dist, word2id, word2vec)
if f_name == 'renyi_entropy':
return renyi_entropy(example, train_term_dist, word2id)
raise ValueError('%s is not a valid feature name.' % f_name)
def number_of_word_types(example):
"""Counts the number of word types of the example."""
return len(set(example))
def type_token_ratio(example):
"""Calculates the type-token ratio of the example."""
return number_of_word_types(example) / len(example)
def entropy(example, train_term_dist, word2id):
"""Calculates Entropy (https://en.wikipedia.org/wiki/Entropy_(information_theory))."""
summed = 0
for word in set(example):
if word in word2id:
p_word = train_term_dist[word2id[word]]
summed += p_word * np.log(p_word)
return - summed
def simpsons_index(example, train_term_dist, word2id):
"""Calculates Simpson's Index (https://en.wikipedia.org/wiki/Diversity_index#Simpson_index)."""
score = np.sum([np.power(train_term_dist[word2id[word]], 2) if word in word2id else 0
for word in set(example)])
return score
def quadratic_entropy(example, train_term_dist, word2id, word2vec):
"""Calculates Quadratic Entropy."""
assert word2vec is not None, ('Error: Word vector representations have to '
'be available for quadratic entropy.')
summed = 0
for word_1 in set(example):
if word_1 not in word2id or word_1 not in word2vec:
continue # continue as the product will be 0
for word_2 in set(example):
if word_2 not in word2id or word_2 not in word2vec:
continue # continue as the product will be 0
p_1 = train_term_dist[word2id[word_1]]
p_2 = train_term_dist[word2id[word_2]]
vec_1 = word2vec[word_1]
vec_2 = word2vec[word_2]
sim = similarity.cosine_similarity(vec_1, vec_2)
summed += sim * p_1 * p_2
return summed
def renyi_entropy(example, domain_term_dist, word2id):
"""Calculates Rényi Entropy (https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy)."""
alpha = 0.99
summed = np.sum([np.power(domain_term_dist[word2id[word]], alpha) if word in word2id else 0 for word in set(example)])
if summed == 0:
# 0 if none of the words appear in the dictionary;
# set to a small constant == low prob instead
summed = 0.0001
score = 1 / (1 - alpha) * np.log(summed)
return score