imdb.py
import cPickle
import gzip
import os
import numpy


def prepare_data(seqs, labels, maxlen=None):
    """Create the matrices from the dataset.

    This pads each sequence to the same length: the length of the
    longest sequence, or maxlen.

    If maxlen is set, sequences of length >= maxlen are dropped,
    together with their labels.

    This swaps the axes: time steps end up on the first axis and
    samples on the second.
    """
    # seqs: a list of sentences, each a list of word indices
    lengths = [len(s) for s in seqs]

    if maxlen is not None:
        # Keep only the sequences strictly shorter than maxlen.
        new_seqs = []
        new_labels = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, labels):
            if l < maxlen:
                new_seqs.append(s)
                new_labels.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        labels = new_labels
        seqs = new_seqs

        if len(lengths) < 1:
            return None, None, None

    n_samples = len(seqs)
    maxlen = numpy.max(lengths)

    # x holds the padded sequences, shape (maxlen, n_samples);
    # x_mask is 1. where x holds real tokens and 0. on the padding.
    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
    for idx, s in enumerate(seqs):
        x[:lengths[idx], idx] = s
        x_mask[:lengths[idx], idx] = 1.

    return x, x_mask, labels
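
# A minimal sketch (not part of the original file) of what prepare_data
# returns for two toy sequences; the word indices here are made up.
#
#     x, x_mask, labels = prepare_data([[3, 5, 7], [4, 2]], [1, 0])
#     # x is (maxlen, n_samples) = (3, 2):   x_mask marks real tokens:
#     #   [[3, 4],                             [[1., 1.],
#     #    [5, 2],                              [1., 1.],
#     #    [7, 0]]                              [1., 0.]]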


def get_dataset_file(dataset, default_dataset, origin):
    '''Look for the dataset: first as a full path, then as a file in
    the current directory, then in the directory holding this script.

    Download the dataset from origin if it is not present and its name
    matches the default.
    '''
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if the dataset sits next to this script.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            dataset
        )
        if os.path.isfile(new_path) or data_file == default_dataset:
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == default_dataset:
        import urllib
        print 'Downloading data from %s' % origin
        urllib.urlretrieve(origin, dataset)
    return dataset
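
# Usage sketch (not in the original file): resolve the default IMDB
# pickle; if "imdb.pkl" is not found locally, it is downloaded.
#
#     path = get_dataset_file(
#         "imdb.pkl", "imdb.pkl",
#         "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")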


def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1,
              maxlen=None):
    '''Loads the dataset.

    :type path: String
    :param path: The path to the dataset (here IMDB)
    :type n_words: int
    :param n_words: The number of words to keep in the vocabulary.
        All extra words are mapped to unknown (1).
    :type valid_portion: float
    :param valid_portion: The proportion of the full train set used for
        the validation set.
    :type maxlen: None or positive int
    :param maxlen: the max sequence length we use in the train/valid set.
    '''

    #############
    # LOAD DATA #
    #############

    # Locate the dataset, downloading it first if necessary.
    path = get_dataset_file(
        path, "imdb.pkl",
        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")

    if path.endswith(".gz"):
        f = gzip.open(path, 'rb')
    else:
        f = open(path, 'rb')

    # The pickle holds two (sequences, labels) tuples, train then test.
    train_set = cPickle.load(f)
    test_set = cPickle.load(f)
    f.close()
    if maxlen:
        # Drop training examples longer than maxlen.
        new_train_set_x = []
        new_train_set_y = []
        for x, y in zip(train_set[0], train_set[1]):
            if len(x) < maxlen:
                new_train_set_x.append(x)
                new_train_set_y.append(y)
        train_set = (new_train_set_x, new_train_set_y)
        del new_train_set_x, new_train_set_y

    # Split the training set into train and validation sets, using a
    # random permutation of the examples.
    train_set_x, train_set_y = train_set
    n_samples = len(train_set_x)
    sidx = numpy.random.permutation(n_samples)
    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    train_set = (train_set_x, train_set_y)
    valid_set = (valid_set_x, valid_set_y)
    def remove_unk(x):
        # Map any word index >= n_words to 1, the "unknown" index.
        return [[1 if w >= n_words else w for w in sen] for sen in x]

    test_set_x, test_set_y = test_set
    valid_set_x, valid_set_y = valid_set
    train_set_x, train_set_y = train_set

    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)
    test_set_x = remove_unk(test_set_x)

    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)

    return train, valid, test
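

if __name__ == '__main__':
    # A minimal usage sketch (not part of the original file): load the
    # data, then batch the first 10 training examples with prepare_data.
    # Assumes imdb.pkl is available locally or downloadable from origin.
    train, valid, test = load_data(n_words=10000, maxlen=100)
    x, x_mask, y = prepare_data(train[0][:10], train[1][:10])
    print('train/valid/test sizes: %d/%d/%d'
          % (len(train[0]), len(valid[0]), len(test[0])))
    print('batch shape (maxlen, n_samples): %s' % (x.shape,))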