ccm_training.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
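"""Fine-tune BERT on (title, text) post pairs with a next-sentence-prediction
style objective: pairs marked "similar" in train_all_examples_only_pairs.json
get label 0, all other pairs get label 1. The script tokenizes the pairs,
trains for a fixed number of epochs with periodic checkpoints, evaluates on a
held-out split, and writes the final weights and per-step training losses."""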
from torch import nn
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForNextSentencePrediction
import torch
from tqdm import tqdm, trange
import io
import numpy as np
import json
import os
import random
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert import BertAdam
from siamesebert_model import SiameseModel
debug_mode = False
n_gpu = 4
root_dir = ''
# bert_location should point at a directory holding the pretrained BERT weights and vocab
bert_location = root_dir if len(root_dir) > 0 else '.'
# The training loop passes next_sentence_label and the validation loop expects NSP logits,
# so load the model with the next-sentence-prediction head rather than the bare BertModel.
model = BertForNextSentencePrediction.from_pretrained(bert_location)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)
model.to(device)
model.train()
tokenizer = BertTokenizer.from_pretrained(bert_location, do_lower_case=True)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)
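# Note: in pytorch_pretrained_bert, BertAdam only applies the warmup/decay schedule when
# t_total is also given; with the default t_total=-1 the learning rate simply stays at 2e-5.
# A sketch, assuming linear warmup is actually wanted and that the number of training
# examples and the batch size (both defined further down) are known at this point:
#   t_total = (num_train_examples // batch_size) * epochs
#   optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1, t_total=t_total)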
def tokenize_one_text(text0, text1):
    """Tokenize a (title, text) pair into fixed-length (512) id/segment/mask lists."""
    text0 = "[CLS] " + text0
    tokenized_text0 = tokenizer.tokenize(text0)[:256]
    segments_id0 = [0] * len(tokenized_text0)
    text1 = "[SEP] " + text1
    # Truncate so that the pair plus the trailing [SEP] never exceeds 512 tokens.
    tokenized_text1 = tokenizer.tokenize(text1)[:511 - len(tokenized_text0)]
    tokenized_text1 += tokenizer.tokenize("[SEP]")
    # The second segment gets token type 1 so BERT can tell the two texts apart.
    segments_id1 = [1] * len(tokenized_text1)
    tokenized_text = tokenized_text0 + tokenized_text1
    padding = [0] * (512 - len(tokenized_text))
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) + padding
    segments_id = segments_id0 + segments_id1 + padding
    input_mask = [1] * len(tokenized_text) + padding
    return indexed_tokens, segments_id, input_mask
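# Usage sketch (hypothetical strings): every returned list is padded or truncated to
# exactly 512 entries, so the stacked tensors built below are rectangular.
#   ids, segs, mask = tokenize_one_text("Post title", "Post body ...")
#   assert len(ids) == len(segs) == len(mask) == 512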
filename = "train_all_examples_only_pairs.json"
with open(root_dir + filename, "r") as read_file:
    all_posts_original = json.load(read_file)
if debug_mode:
    all_posts_original = all_posts_original[:1000]
all_post_title = []
all_post_text = []
labels = []
for i in all_posts_original:
    all_post_title.append(i["title"])
    all_post_text.append(i["text"])
    # Label 0 means the pair is marked similar (matching NSP's convention, where 0 = IsNext), 1 otherwise.
    if i["similar"] == 1:
        labels.append(0)
    else:
        labels.append(1)
max_seq_len = 512
del all_posts_original
#tokenize
input_ids = []
segment_ids = []
attention_masks = []
for i in trange(len(all_post_title)):
    indexed_tokens, segments_id, input_mask = tokenize_one_text(all_post_title[i], all_post_text[i])
    input_ids.append(indexed_tokens)
    segment_ids.append(segments_id)
    attention_masks.append(input_mask)
del all_post_title
del all_post_text
# The same random_state and test_size keep the three splits aligned row-for-row.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=42, test_size=0.01)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=42, test_size=0.01)
train_segment_ids, validation_segment_ids, _, _ = train_test_split(
    segment_ids, input_ids, random_state=42, test_size=0.01)
del input_ids
del segment_ids
del attention_masks
#convert data to tensors
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
train_segment_ids = torch.tensor(train_segment_ids)
validation_segment_ids = torch.tensor(validation_segment_ids)
batch_size = n_gpu * 6
train_data = TensorDataset(train_inputs, train_masks, train_segment_ids, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_segment_ids, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
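# Worked example: logits [[2.1, -1.3], [0.2, 0.9]] argmax to predictions [0, 1];
# against labels [0, 0] that is 1 correct out of 2, so flat_accuracy returns 0.5.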
# Validation
print("first evaluation before training")
# Put model in evaluation mode to evaluate loss on the validation set
model.eval()
# Tracking variables
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch
for batch in tqdm(validation_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_segment_id, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        logits = model(b_input_ids, token_type_ids=b_segment_id, attention_mask=b_input_mask)
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print(torch.cuda.memory_allocated()/(10**9), "GB")
# Store our loss and accuracy for plotting
train_loss_set = []
# Number of training epochs (authors recommend between 2 and 4)
epochs = 2
# trange is a tqdm wrapper around the normal python range
for epoch in trange(epochs, desc="Epoch"):
    # Training
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    # Train the data for one epoch
    for step, batch in enumerate(tqdm(train_dataloader)):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_segment_id, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        loss = model(b_input_ids, token_type_ids=b_segment_id, attention_mask=b_input_mask, next_sentence_label=b_labels)
        if n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        train_loss_set.append(loss.item())
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        if step % 5000 == 0 and step != 0:
            model_fn = "{}Epoch_{}_{}_tropes_all_examples_only.bin".format(root_dir, epoch, step)
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), model_fn)
            print("Train loss: {}".format(tr_loss / nb_tr_steps))
    print("Train loss: {}".format(tr_loss / nb_tr_steps))
    # Validation
    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()
    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Evaluate data for one epoch
    for batch in tqdm(validation_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_segment_id, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            logits = model(b_input_ids, token_type_ids=b_segment_id, attention_mask=b_input_mask)
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
    # Save a checkpoint at the end of the epoch
    model_fn = root_dir + "first_epoch_tropes_all_examples_only.bin"
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save(model_to_save.state_dict(), model_fn)
#save model
model_fn = root_dir + "tropes_all_examples_only.bin"
model_to_save = model.module if hasattr(model, 'module') else model
torch.save(model_to_save.state_dict(), model_fn)
#save training loss
training_loss_set = [float(i) for i in train_loss_set]
with open(root_dir + "training_losses_2_epoch.json", "w") as write_file:
    json.dump(training_loss_set, write_file)
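# A minimal reload sketch for later inference (assumes the same pytorch_pretrained_bert
# version; the checkpoint name comes from the save above):
#   model = BertForNextSentencePrediction.from_pretrained(bert_location)
#   model.load_state_dict(torch.load(root_dir + "tropes_all_examples_only.bin", map_location="cpu"))
#   model.eval()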