data.py

import collections
import operator
import random
from functools import reduce

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

import config_lm


def mask_target(file, tokenizer):
    """Replace the token at the row's masked_index with the tokenizer's mask token."""
    sentence = file["target"].split()
    sentence[file["masked_index"]] = tokenizer.mask_token
    return " ".join(sentence)


def flatten(lis):
    """Concatenate a list of lists into one flat list."""
    return reduce(operator.concat, lis)


def preprocesser(file, tokenizer):
    """Mask one random target token per row and build the two relation strings."""
    file["masked_index"] = file.apply(
        lambda x: random.randrange(0, len(x["target"].split()), 1), axis=1
    )
    file["target"] = file.apply(
        lambda x: mask_target(x, tokenizer=tokenizer), axis=1
    )
    # offsets of the masked token once each source phrase is prepended
    file["masked_index_1"] = file.apply(
        lambda x: x["masked_index"] + len(x["source_1"].split()), axis=1
    )
    file["masked_index_2"] = file.apply(
        lambda x: x["masked_index"] + len(x["source_2"].split()), axis=1
    )
    # drop rows whose two source phrases are identical
    index_names = file[file["source_1"] == file["source_2"]].index
    file.drop(index_names, inplace=True)
    l1 = file.apply(
        lambda x: x["source_1"] + " " + x["rela"] + " " + x["target"], axis=1
    ).tolist()
    l2 = file.apply(
        lambda x: x["source_2"] + " " + x["rela"] + " " + x["target"], axis=1
    ).tolist()
    return l1, l2
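

# A minimal sketch of the expected input, assuming a pandas DataFrame with the
# "source_1", "source_2", "rela", and "target" columns used above (the sample
# values are hypothetical):
#   df = pd.DataFrame({
#       "source_1": ["big"], "source_2": ["large"],
#       "rela": ["is similar to"], "target": ["huge in size"],
#   })
#   l1, l2 = preprocesser(df, tokenizer)
#   # l1[0] might then be "big is similar to huge [MASK] size"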


class DoubleSynonymsDataset(Dataset):
    """Create a dataset for the boylm model."""

    def __init__(self, triples):
        self.tokenizer = AutoTokenizer.from_pretrained(config_lm.models["bert"])
        self.list_of_tokens = self.tokenizing(triples)

    def __len__(self):
        return len(self.list_of_tokens)

    def __getitem__(self, idx):
        return self.list_of_tokens[idx]

    def filtering_ids(self, tokens, msk=None):
        """Keep only the examples whose input_ids contain a mask token."""
        masked_indexes = collections.defaultdict(int)
        for i, ids in enumerate(tokens["input_ids"]):
            try:
                masked_indexes[i] = ids.tolist().index(self.tokenizer.mask_token_id)
            except ValueError:
                continue
        # reuse a caller-supplied mask so the same examples are dropped
        # from both datasets
        if torch.is_tensor(msk):
            mask = msk
        else:
            keep = [0] * tokens["input_ids"].shape[0]
            for i in masked_indexes:
                keep[i] = 1
            mask = torch.nonzero(torch.tensor(keep))
        tokens["input_ids"] = tokens["input_ids"][mask].squeeze()
        tokens["token_type_ids"] = tokens["token_type_ids"][mask].squeeze()
        tokens["attention_mask"] = tokens["attention_mask"][mask].squeeze()
        return tokens, list(masked_indexes.values()), mask
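
    # A sketch of the intended msk reuse (hypothetical call pattern): filter
    # one tokenized batch, then pass the returned mask so a second batch keeps
    # the same rows:
    #   tok_1, idx_1, kept = ds.filtering_ids(batch_1)
    #   tok_2, idx_2, _ = ds.filtering_ids(batch_2, msk=kept)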

    def tokenizing(self, list_of_tokens):
        """Tokenize the raw strings and pair each kept example with its masked index."""
        network_inputs = self.tokenizer(
            list_of_tokens,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=config_lm.max_length,
        )
        # keep only the tensors that contain a mask, along with each masked index
        network_inputs, masks_o, include_exmp = self.filtering_ids(network_inputs)
        view = list(zip(network_inputs["input_ids"], masks_o))
        return view


class ConcatDataset(Dataset):
    """Zip several datasets together, returning the i-th item of each one."""

    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, i):
        return [d[i] for d in self.datasets]

    def __len__(self):
        return min(len(d) for d in self.datasets)
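

# A minimal end-to-end sketch, not part of the original module's API: the CSV
# path, batch size, and column values are hypothetical, and it assumes
# config_lm provides models["bert"] and max_length as used above. It also
# assumes every row keeps its mask after truncation, so the two zipped
# datasets stay aligned (the msk argument of filtering_ids exists for the
# cases where they would not).
if __name__ == "__main__":
    import pandas as pd
    from torch.utils.data import DataLoader

    tokenizer = AutoTokenizer.from_pretrained(config_lm.models["bert"])
    df = pd.read_csv("triples.csv")  # hypothetical file with the columns above
    l1, l2 = preprocesser(df, tokenizer)

    # one dataset per source phrase, zipped so the paired examples line up
    paired = ConcatDataset(DoubleSynonymsDataset(l1), DoubleSynonymsDataset(l2))
    loader = DataLoader(paired, batch_size=8)
    for (ids_1, idx_1), (ids_2, idx_2) in loader:
        print(ids_1.shape, ids_2.shape)
        break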