Commit 6aa292a

All the baselines are ready to be trained. New group trainer. New dockerfile. Major improvements.
ab3llini committed Oct 9, 2019
1 parent 025abe6 commit 6aa292a
Showing 46 changed files with 260 additions and 367 deletions.
Empty file modified LICENSE
100644 → 100755
Empty file.
Empty file modified README.md
100644 → 100755
Empty file.
2 changes: 2 additions & 0 deletions resources/docker/Dockerfile
@@ -0,0 +1,2 @@
FROM ab3llini/thesis:latest
RUN pip install tqdm requests fire regex sklearn matplotlib scikit-image keras torch torchvision nltk transformers numpy future Cython tensorboard
8 changes: 8 additions & 0 deletions resources/docker/docker-compose.yml
@@ -0,0 +1,8 @@
version: '3'

services:
thesis:
image: ab3llini/thesis:latest
volumes:
- /home/alberto/PycharmProjects/OpenDomainVQA:/opt/project
shm_size: 60GB
Empty file modified src/datasets/__init__.py
100644 → 100755
Empty file.
39 changes: 22 additions & 17 deletions src/datasets/bert.py
100644 → 100755
@@ -7,7 +7,7 @@

import torch
from utilities.vqa.dataset import *
from pytorch_transformers import BertTokenizer
from transformers import BertTokenizer
from datasets.creator import QADatasetCreator
from torch.utils.data import Dataset

@@ -20,8 +20,6 @@ def __init__(self, tokenizer=None, tr_size=None, ts_size=None, generation_seed=N
self.tokenizer = tokenizer
else:
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
self.tokenizer.add_special_tokens(
{'bos_token': '<bos>', 'eos_token': '<eos>', 'sep_token': '<sep>'})

def embed_fn(self, text):
"""
@@ -35,17 +33,21 @@ def embed_fn(self, text):
def process(self, candidates_tr, candidates_ts):
# Add tokens to separate Q & A

longest_tr, longest_ts = [0], [0]

print('Processing..')
for candidates in [candidates_tr, candidates_ts]:
for candidates, longest in [[candidates_tr, longest_tr], [candidates_ts, longest_ts]]:
for i, sample in tqdm(enumerate(candidates)):
if longest[0] < sample[self.tkn_a_len_idx] + sample[self.tkn_q_len_idx]:
longest[0] = sample[self.tkn_a_len_idx] + sample[self.tkn_q_len_idx]
# Save some information
l_q = sample[self.tkn_q_len_idx] + 2 # + BOS & SEP
l_a = sample[self.tkn_a_len_idx] + 1 # + EOS

# Add BOS, SEP & EOS tokens
sample[self.tkn_q_idx] = [self.tokenizer.bos_token_id] + sample[self.tkn_q_idx] + [
sample[self.tkn_q_idx] = [self.tokenizer.cls_token_id] + sample[self.tkn_q_idx] + [
self.tokenizer.sep_token_id]
sample[self.tkn_a_idx] = sample[self.tkn_a_idx] + [self.tokenizer.eos_token_id]
sample[self.tkn_a_idx] = sample[self.tkn_a_idx] + [self.tokenizer.sep_token_id]
# Concatenate Q+A
sample[self.tkn_q_idx] += sample[self.tkn_a_idx]
# Compute sequence length
@@ -57,16 +59,18 @@ def process(self, candidates_tr, candidates_ts):
sample[self.tkn_a_len_idx] = [1] * (l_q + l_a) # Replacing answer len with pad mask

# Pad sequences
self.pad_sequences(candidates_tr, axis=1, value=int(self.tokenizer.pad_token_id))
self.pad_sequences(candidates_ts, axis=1, value=int(self.tokenizer.pad_token_id))
candidates_tr = self.pad_sequences(candidates_tr, axis=1, value=int(self.tokenizer.pad_token_id),
maxlen=longest_tr[0])
candidates_ts = self.pad_sequences(candidates_ts, axis=1, value=int(self.tokenizer.pad_token_id),
maxlen=longest_ts[0])

# Pad token type ids
self.pad_sequences(candidates_tr, axis=3, value=1)
self.pad_sequences(candidates_ts, axis=3, value=1)
candidates_tr = self.pad_sequences(candidates_tr, axis=3, value=1, maxlen=longest_tr[0])
candidates_ts = self.pad_sequences(candidates_ts, axis=3, value=1, maxlen=longest_ts[0])

# Pad padding masks
self.pad_sequences(candidates_tr, axis=4, value=0)
self.pad_sequences(candidates_ts, axis=4, value=0)
candidates_tr = self.pad_sequences(candidates_tr, axis=4, value=0, maxlen=longest_tr[0])
candidates_ts = self.pad_sequences(candidates_ts, axis=4, value=0, maxlen=longest_ts[0])

return candidates_tr, candidates_ts

@@ -99,16 +103,17 @@ def __getitem__(self, item):
token_types = torch.tensor(sample[3]).long()
att_mask = torch.tensor(sample[4]).long()

return identifier, sequence, length, token_types, att_mask
return identifier, sequence, token_types, att_mask, length

def __len__(self):
return self.maxlen


if __name__ == '__main__':
def create():
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_tokenizer.add_special_tokens(
{'bos_token': '<bos>', 'eos_token': '<eos>', 'sep_token': '<sep>'})
destination = resources_path('models', 'baseline', 'answering', 'bert', 'data')
dsc = BertDatasetCreator(tokenizer=bert_tokenizer, tr_size=1000000, ts_size=100000, generation_seed=555)
dsc.create(destination)
dsc.create(destination)

if __name__ == '__main__':
create()
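
The bert.py changes above switch the question/answer layout to the tokenizer's native [CLS]/[SEP] ids and pad each split to its own longest question+answer length. Below is a minimal, standalone sketch of that layout and padding, not part of the commit; the sample sentences and helper names are hypothetical.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def build_sequence(question, answer):
    # [CLS] question [SEP] answer [SEP], as in the updated process() above
    q_ids = tokenizer.encode(question, add_special_tokens=False)
    a_ids = tokenizer.encode(answer, add_special_tokens=False)
    seq = [tokenizer.cls_token_id] + q_ids + [tokenizer.sep_token_id] \
          + a_ids + [tokenizer.sep_token_id]
    token_types = [0] * (len(q_ids) + 2) + [1] * (len(a_ids) + 1)
    return seq, token_types

pairs = [build_sequence('What color is the cat?', 'black'),
         build_sequence('How many dogs are in the picture?', 'two')]
longest = max(len(seq) for seq, _ in pairs)  # per-split maximum, like longest_tr / longest_ts
padded = [seq + [tokenizer.pad_token_id] * (longest - len(seq)) for seq, _ in pairs]
masks = [[1] * len(seq) + [0] * (longest - len(seq)) for seq, _ in pairs]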
21 changes: 16 additions & 5 deletions src/datasets/captioning.py
100644 → 100755
@@ -31,11 +31,14 @@ def embed_fn(self, text):
def process(self, candidates_tr, candidates_ts):

word_freq = Counter()
longest_tr, longest_ts = [0], [0]

# Create embeddings
print('Processing..')
for candidates in [candidates_tr, candidates_ts]:
for candidates, longest in [[candidates_tr, longest_tr], [candidates_ts, longest_ts]]:
for i, sample in tqdm(enumerate(candidates)):
if longest[0] < sample[self.tkn_a_len_idx]:
longest[0] = sample[self.tkn_a_len_idx]
# Compute word frequencies
word_freq.update(sample[self.tkn_a_idx])

@@ -57,8 +60,11 @@ def process(self, candidates_tr, candidates_ts):
sample[self.tkn_a_len_idx] = len(sample[self.tkn_a_idx])

# Pad sequences
self.pad_sequences(candidates_tr, axis=1, value=word_map['<pad>'])
self.pad_sequences(candidates_ts, axis=1, value=word_map['<pad>'])
print('Padding to size={},{}'.format(longest_tr, longest_ts))
candidates_tr = self.pad_sequences(candidates_tr, axis=self.tkn_a_idx, value=word_map['<pad>'],
maxlen=longest_tr[0])
candidates_ts = self.pad_sequences(candidates_ts, axis=self.tkn_a_idx, value=word_map['<pad>'],
maxlen=longest_ts[0])

# Save word map to a JSON
with open(os.path.join(self.wordmap_location, 'wordmap.json'), 'w') as j:
@@ -104,7 +110,12 @@ def __len__(self):
return self.maxlen


if __name__ == '__main__':
def create():
nltk.download('punkt')
destination = resources_path('models', 'baseline', 'captioning', 'data')
dsc = CaptioningDatasetCreator(destination, tr_size=1000000, ts_size=100000, generation_seed=555)
dsc = CaptioningDatasetCreator(destination, generation_seed=555)
dsc.create(destination)


if __name__ == '__main__':
create()
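
The captioning.py changes above track the longest tokenized caption per split and pad with the word map's '<pad>' id. The following is a minimal sketch of such a word-map and padding step, not part of the commit; the frequency threshold and the '<unk>' entry are assumptions, only '<pad>' appears in the diff.

from collections import Counter

def build_word_map(tokenized_captions, min_freq=1):
    word_freq = Counter()
    for tokens in tokenized_captions:
        word_freq.update(tokens)
    words = [w for w, c in word_freq.items() if c >= min_freq]
    word_map = {w: i + 1 for i, w in enumerate(words)}  # 0 is reserved for '<pad>'
    word_map['<pad>'] = 0
    word_map['<unk>'] = len(word_map)                   # hypothetical fallback id
    return word_map

def pad_to_longest(captions, pad_id):
    longest = max(len(c) for c in captions)             # per-split, like longest_tr / longest_ts
    return [c + [pad_id] * (longest - len(c)) for c in captions]

word_map = build_word_map([['a', 'cat', 'on', 'a', 'mat'], ['two', 'dogs']])
padded = pad_to_longest([[word_map[w] for w in ['a', 'cat', 'on', 'a', 'mat']],
                         [word_map[w] for w in ['two', 'dogs']]], word_map['<pad>'])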
23 changes: 18 additions & 5 deletions src/datasets/creator.py
100644 → 100755
@@ -56,17 +56,23 @@ def create_candidates(self):
"""

candidates_tr, candidates_ts = [], []
not_rgb = 0

# We are working on Open Domain Answering systems
# Hence we are interested in the longest answers (annotations) first
objects = [[self.qa_objects_tr, candidates_tr, self.vqa_helper_tr],
[self.qa_objects_ts, candidates_ts, self.vqa_helper_ts]]
objects = [[self.qa_objects_tr, candidates_tr, self.vqa_helper_tr, self.i_path_tr],
[self.qa_objects_ts, candidates_ts, self.vqa_helper_ts, self.i_path_ts]]
print('Building auxiliary candidate structures..')
for qa_objects, candidates, vqa_helper in objects:
for qa_objects, candidates, vqa_helper, image_paths in objects:
# c = 0
# Skip if image is not RGB
for qa_object in tqdm(qa_objects):

# Parse object
obj_id, obj_q, obj_as, obj_i = get_qai(qa_object, vqa_helper)
if not check_rgb(image_paths, obj_i):
not_rgb += 10
continue

# Embed the question
q_embed, q_embed_len = self.embed_fn(obj_q)
@@ -87,6 +93,11 @@

# Add sample
candidates.append([obj_id, q_embed, q_embed_len, obj_i, prev_a_emb, prev_a_emb_len])
# if c > 100:
# break
# else:
# c += 1
print('Non RGB samples (removed) = {}'.format(not_rgb))

candidates_tr, candidates_ts = np.array(candidates_tr), np.array(candidates_ts)

@@ -222,14 +233,16 @@ def build(self):
return self.filter_candidates(*candidates)

@staticmethod
def pad_sequences(candidates, axis, value):
def pad_sequences(candidates, axis, value, maxlen):
if not isinstance(candidates, (np.ndarray, np.generic)):
candidates = np.array(candidates)
padded = k_preproc.sequence.pad_sequences(candidates[:, axis], padding='post',
value=value)
value=value, maxlen=maxlen)
for sample, pad in zip(candidates, padded):
sample[axis] = pad

return candidates

def create(self, location):
candidates = self.build()
set_tr, set_ts = self.process(*candidates)
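
The updated pad_sequences() in creator.py now takes an explicit maxlen and returns the modified candidates, so every padded column of a split ends up with the same width (the split's longest sequence) and callers must reassign the result. A minimal sketch of the underlying keras call, assuming k_preproc is keras.preprocessing as imported in creator.py:

import keras.preprocessing as k_preproc

seqs = [[5, 6, 7], [8, 9]]
padded = k_preproc.sequence.pad_sequences(seqs, padding='post', value=0, maxlen=4)
print(padded)
# [[5 6 7 0]
#  [8 9 0 0]]

Passing the same maxlen for the token, token-type, and mask axes is what keeps those columns aligned after padding.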
21 changes: 14 additions & 7 deletions src/datasets/gpt2.py
100644 → 100755
@@ -7,7 +7,7 @@

import torch
from utilities.vqa.dataset import *
from pytorch_transformers import GPT2Tokenizer
from transformers import GPT2Tokenizer
from datasets.creator import QADatasetCreator
from torch.utils.data import Dataset

@@ -34,10 +34,13 @@ def embed_fn(self, text):

def process(self, candidates_tr, candidates_ts):
# Add tokens to separate Q & A
longest_tr, longest_ts = [0], [0]

print('Processing..')
for candidates in [candidates_tr, candidates_ts]:
for candidates, longest in [[candidates_tr, longest_tr], [candidates_ts, longest_ts]]:
for i, sample in tqdm(enumerate(candidates)):
if longest[0] < sample[self.tkn_a_len_idx] + sample[self.tkn_q_len_idx]:
longest[0] = sample[self.tkn_a_len_idx] + sample[self.tkn_q_len_idx]
# Add BOS, SEP & EOS tokens
sample[self.tkn_q_idx] = [self.tokenizer.bos_token_id] + sample[self.tkn_q_idx] + [
self.tokenizer.sep_token_id]
@@ -54,10 +57,10 @@
candidates_ts = np.delete(candidates_ts, obj=[self.tkn_a_idx, self.tkn_a_len_idx], axis=1)

# Pad sequences
self.pad_sequences(candidates_tr, axis=1, value=int(self.tokenizer.pad_token_id))

# Pad sequences
self.pad_sequences(candidates_ts, axis=1, value=int(self.tokenizer.pad_token_id))
candidates_tr = self.pad_sequences(candidates_tr, axis=1, value=int(self.tokenizer.pad_token_id),
maxlen=longest_tr[0])
candidates_ts = self.pad_sequences(candidates_ts, axis=1, value=int(self.tokenizer.pad_token_id),
maxlen=longest_ts[0])

return candidates_tr, candidates_ts

@@ -94,10 +97,14 @@ def __len__(self):
return self.maxlen


if __name__ == '__main__':
def create():
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokenizer.add_special_tokens(
{'pad_token': '<pad>', 'bos_token': '<bos>', 'eos_token': '<eos>', 'sep_token': '<sep>'})
destination = resources_path('models', 'baseline', 'answering', 'gpt2', 'data')
dsc = GPT2DatasetCreator(tokenizer=gpt2_tokenizer, tr_size=1000000, ts_size=100000, generation_seed=555)
dsc.create(destination)


if __name__ == '__main__':
create()
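
The gpt2.py creator adds <pad>, <bos>, <eos> and <sep> to the GPT-2 vocabulary, which normally requires resizing the language model's embeddings before training. A short sketch of that pairing follows, as an assumption; the model-side resize is not part of this diff.

from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens(
    {'pad_token': '<pad>', 'bos_token': '<bos>', 'eos_token': '<eos>', 'sep_token': '<sep>'})

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))  # new rows for the four added tokens

# A question/answer pair laid out as the comment in process() describes: <bos> Q <sep> A <eos>
q_ids = tokenizer.encode('What color is the cat?')
a_ids = tokenizer.encode('black')
seq = [tokenizer.bos_token_id] + q_ids + [tokenizer.sep_token_id] \
      + a_ids + [tokenizer.eos_token_id]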
Empty file modified src/models/__init__.py
100644 → 100755
Empty file.
Empty file modified src/models/baseline/__init__.py
100644 → 100755
Empty file.
Empty file modified src/models/baseline/answering/__init__.py
100644 → 100755
Empty file.
Empty file modified src/models/baseline/answering/bert/__init__.py
100644 → 100755
Empty file.
31 changes: 16 additions & 15 deletions src/models/baseline/answering/bert/train.py
100644 → 100755
@@ -2,34 +2,31 @@
import os

this_path = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.abspath(os.path.join(this_path, os.pardir, os.pardir, os.pardir))
root_path = os.path.abspath(os.path.join(this_path, os.pardir, os.pardir, os.pardir, os.pardir))
sys.path.append(root_path)

from pytorch_transformers import BertForMaskedLM
from models.bert import model as bert_model
from transformers import BertForMaskedLM
from utilities.training import *
from torch.utils.tensorboard import SummaryWriter
from torch.optim import Adam
from models.bert import loss as bert_loss
from models.bert.dataset import *
from datasets.bert import *


def bert_logging_fn(out, batch, description):
ret = ''
for s in range(3):
ret += '*' * 25 + '\n'
ret += '{}'.format(description) + '\n'
ret += 'Input = {}\n'.format(tokenizer.decode(batch[0][s].tolist()))
ret += 'Output = {}\n'.format(tokenizer.decode(torch.argmax(out[0][s], dim=1).tolist()))
ret += 'Input = {}\n'.format(bert_tokenizer.decode(batch[0][s].tolist()))
ret += 'Output = {}\n'.format(bert_tokenizer.decode(torch.argmax(out[0][s], dim=1).tolist()))
return ret


if __name__ == '__main__':

def train():
model_basepath = os.path.join('models', 'baseline', 'answering', 'bert')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
decode = lambda text: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
@@ -39,9 +36,9 @@ def bert_logging_fn(out, batch, description):
model.train()

tr_dataset = BertDataset(directory=resources_path(model_basepath, 'data'),
name='tr_bert_answering')
name='training.pk')
ts_dataset = BertDataset(directory=resources_path(model_basepath, 'data'),
name='ts_bert_answering', split='test')
name='testing.pk', split='test')

learning_rate = 5e-5

@@ -53,13 +50,17 @@ def bert_logging_fn(out, batch, description):
loss=lambda out, batch: bert_loss.loss_fn(out[0], batch[0]),
lr=learning_rate,
batch_size=64,
batch_extractor=lambda batch: [batch[1], batch[3], batch[4]], # Get rid of the image
batch_extractor=lambda batch: batch[1:-1], # Get rid of id & seq length
epochs=3,
tensorboard=SummaryWriter(log_dir=resources_path(model_basepath, 'runs')),
checkpoint_path=resources_path(model_basepath, 'checkpoints'),
logging_fp=open(resources_path(model_basepath, 'predictions', 'train.txt'), 'w+'),
logging_fp=None,
logging_fn=bert_logging_fn,
logging_interval=10
)

bert_trainer.train()
bert_trainer.train()


if __name__ == '__main__':
train()
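
The train.py change replaces the hard-coded index list with batch_extractor=lambda batch: batch[1:-1]; with the reordered BertDataset.__getitem__ above, that slice drops the identifier at position 0 and the sequence length at the end, keeping the sequence, token types and attention mask. A trivial sketch of the slice, not part of the commit:

# Positions follow the reordered __getitem__: id, sequence, token_types, att_mask, length
batch = ['identifier', 'sequence', 'token_types', 'att_mask', 'length']
model_inputs = batch[1:-1]
print(model_inputs)  # ['sequence', 'token_types', 'att_mask']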
Empty file modified src/models/baseline/answering/gpt2/__init__.py
100644 → 100755
Empty file.
(Remaining file diffs not shown.)
