-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset.py
167 lines (126 loc) · 5.31 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import copy
import json
import os
import time

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoTokenizer,
)
from transformers.tokenization_utils import PreTrainedTokenizer
class InputExample(object):
    """
    A single training/test example for simple sequence classification.

    Attributes:
        guid: Identifier of the example.
        text_a: First text segment.
        text_b: Second text segment, or ``None`` when there is none.
        label: Label of the example.
        features: Extra per-example features; a fresh empty list when the
            caller does not supply one.
    """

    def __init__(self, guid, text_a, text_b, label, features=None):
        self.guid = guid
        self.label = label
        self.text_a = text_a
        self.text_b = text_b
        # A literal ``[]`` default would be one list shared by every instance,
        # so build a new list per example instead.
        self.features = [] if features is None else features

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary (a deep copy)."""
        return copy.deepcopy(self.__dict__)

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
def generate_examples(mode, texts, evidence, labels):
    """Build one ``InputExample`` per entry of ``texts``.

    Args:
        mode: Prefix used in each example's guid (``"<mode>-<index>"``).
        texts: Sequence of first-segment texts.
        evidence: Sequence indexed in parallel with ``texts`` that supplies
            the second segment, or ``None`` to leave every second segment
            unset.
        labels: Sequence of labels indexed in parallel with ``texts``.

    Returns:
        A list of ``InputExample`` objects in the order of ``texts``.
    """
    has_evidence = evidence is not None
    return [
        InputExample(
            guid="%s-%s" % (mode, position),
            text_a=text,
            text_b=evidence[position] if has_evidence else None,
            label=labels[position],
        )
        for position, text in enumerate(texts)
    ]
class Cred_Dataset(Dataset):
    """Tokenized classification dataset with an on-disk feature cache.

    On construction the features are either loaded from a cache file or
    built from the JSON file at ``args.data_path`` (formatted with
    ``args.ver``, ``cluster_num`` and ``mode``) and then written to the cache.

    Attributes:
        features: Mapping of feature name to tensor; always has a ``'labels'``
            entry. Other keys come from the tokenizer output.
        encodings: Same object as ``features``.
        labels: Label indices as a plain Python list.
        datas: The parsed JSON content.
        idx2docid: Maps ``str(position)`` to the corresponding key of ``datas``.
        num_data: Number of examples.
        label_list: The accepted label strings, ``"0"`` .. ``"10"``.
    """

    def __init__(self, args, mode, cluster_num, task_type, tokenizer=None):
        """Load cached features or build and cache them.

        Args:
            args: Namespace-like object; this block reads ``data_path``,
                ``cache_dir``, ``data_dir`` (only when ``cache_dir`` is
                ``None``), ``ver`` and ``max_seq_len`` from it.
            mode: Split name; used in the data path, cache name and guids.
            cluster_num: Value substituted into the data path and cache name.
            task_type: Key of each record that holds the label. For
                ``'shallow_mean'`` / ``'shallow_std'`` only the title is
                encoded; otherwise the record's ``'evidence'`` is paired with it.
            tokenizer: Tokenizer instance providing ``batch_encode_plus``.
                Required: its class name is part of the cache file name.

        Raises:
            ValueError: If ``tokenizer`` is not supplied.
            KeyError: If a record's label is not in ``label_list``.
        """
        # The previous default was the PreTrainedTokenizer *class* itself,
        # which cannot tokenize anything; fail early with a clear message.
        if tokenizer is None:
            raise ValueError("Cred_Dataset requires a tokenizer instance")

        self.args = args
        self.data_path = args.data_path
        self.mode = mode
        self.cluster_num = cluster_num
        self.label_list = [str(i) for i in range(11)]

        cached_features_file = os.path.join(
            args.cache_dir if args.cache_dir is not None else args.data_dir,
            "cached_{}_{}_{}_{}_{}_{}".format(
                self.args.ver,
                self.cluster_num,
                task_type,
                self.mode,
                tokenizer.__class__.__name__,
                str(args.max_seq_len),
            ),
        )

        if os.path.exists(cached_features_file):
            print("*** Loading features from cached file {}".format(cached_features_file))
            # NOTE(review): torch.load unpickles the file, so only load cache
            # files you created yourself. Newer PyTorch releases may need
            # weights_only=False to restore non-tensor objects - confirm
            # against the installed version.
            (self.features, self.datas, self.idx2docid) = torch.load(cached_features_file)
            # Keep the same attributes available as on the freshly-built path.
            self.encodings = self.features
            self.labels = self.features['labels'].tolist()
        else:
            self._build_features(task_type, tokenizer)
            cache_parent = os.path.dirname(cached_features_file)
            if cache_parent:
                # torch.save does not create missing directories.
                os.makedirs(cache_parent, exist_ok=True)
            print("*** Saving features into cached file {}".format(cached_features_file))
            torch.save((self.features, self.datas, self.idx2docid), cached_features_file)

        self.num_data = len(self.features['labels'])

    def _build_features(self, task_type, tokenizer):
        """Read the raw JSON for this split and tokenize it into ``self.features``."""
        # data load: "./dataset/{}.json".format(mode) : train, valid, test --> "./dataset/train.json"
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        source_file = self.data_path.format(self.args.ver, self.cluster_num, self.mode)
        with open(source_file, 'r', encoding='utf-8') as fp:
            self.datas = json.load(fp)

        title_only = task_type in ('shallow_mean', 'shallow_std')
        texts, evidence, labels = [], [], []
        self.idx2docid = {}
        for idx, doc_id in enumerate(self.datas):
            record = self.datas[doc_id]
            self.idx2docid[str(idx)] = doc_id
            texts.append(record['title'])
            evidence.append(None if title_only else record['evidence'])
            labels.append(str(record[task_type]))

        examples = generate_examples(self.mode, texts, evidence, labels)

        # Labels are mapped to their position in label_list; an unknown label
        # raises KeyError.
        label_map = {label: i for i, label in enumerate(self.label_list)}
        self.labels = [label_map[example.label] for example in examples]

        self.encodings = tokenizer.batch_encode_plus(
            [(example.text_a, example.text_b) if example.text_b else example.text_a for example in examples],
            max_length=self.args.max_seq_len,
            padding='max_length',
            truncation='longest_first',
            return_tensors="pt",
        )
        self.features = self.encodings
        self.features['labels'] = torch.tensor(self.labels)

    def __len__(self):
        return len(self.features['labels'])

    def __getitem__(self, idx):
        """Return a dict with an independent copy of every feature at ``idx``."""
        item = {}
        for key, val in self.features.items():
            row = val[idx]
            # torch.tensor(<tensor>) emits a copy-construct UserWarning;
            # clone().detach() is the recommended way to copy a tensor.
            item[key] = row.clone().detach() if torch.is_tensor(row) else torch.tensor(row)
        return item

    def get_labels(self):
        """Return the label tensor stored under ``features['labels']``."""
        return self.features['labels']
if __name__ == '__main__':
    # Manual smoke test: build the training split, pull one batch, then drop
    # into an interactive shell to inspect it.
    from train import get_args

    args = get_args()
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        cache_dir=args.cache_dir,
    )
    ds = Cred_Dataset(
        args=args,
        mode='train',
        # cluster_num is a required constructor argument; omitting it raised
        # TypeError. NOTE(review): assumes get_args() may expose `cluster_num`
        # and falls back to 0 otherwise - confirm the intended value.
        cluster_num=getattr(args, 'cluster_num', 0),
        task_type='shallow_mean',
        tokenizer=tokenizer,
    )
    dl = DataLoader(ds, batch_size=args.batch_size)
    d = next(iter(dl))

    import IPython
    IPython.embed()
    exit(1)