-
Notifications
You must be signed in to change notification settings - Fork 13
/
merge_test_labels.py
54 lines (48 loc) · 1.85 KB
/
merge_test_labels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""Script for merging the test data with ground truth labels."""
import os
import csv
import glovar
def parse_truth_lines(lines):
    """Parse the lines of the truth file into an ``{id: label}`` mapping.

    Lines starting with ``#`` are comments and blank lines are ignored.
    Every remaining line must hold an id and an integer label separated by
    whitespace.  Most lines use a tab, but the truth file contains a line
    separated by three spaces, so the split is done on the last run of
    whitespace of any kind.

    Args:
        lines: Iterable of strings (e.g. an open text file).

    Returns:
        Dict mapping each id (str) to its label (int).

    Raises:
        ValueError: If a data line does not contain an id and a label, or
            the label is not an integer.
    """
    label_dict = {}
    for line_no, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if not line or line.startswith('#'):
            continue
        # Split on the last whitespace run: handles the tab-separated lines
        # as well as the line separated by spaces.
        fields = line.rsplit(None, 1)
        if len(fields) != 2:
            raise ValueError(
                'Malformed line {} in truth file: {!r}'.format(line_no, line))
        _id, label = fields
        label_dict[_id] = int(label)
    return label_dict


def merge_test_labels(test_only_data_path, truth_data_path, test_data_path):
    """Append the ground truth label column to the test data.

    Reads the tab-separated test data, adds a ``correctLabelW0orW1`` column
    holding the label found in the truth file for the id in each row's first
    column, and writes the result (tab-separated) to ``test_data_path``.

    Args:
        test_only_data_path: Path of the tab-separated test data; its first
            row is treated as the header.
        truth_data_path: Path of the truth file (id and label per line).
        test_data_path: Path the merged file is written to.

    Raises:
        ValueError: If the test data file is empty or the truth file is
            malformed.
        KeyError: If a test row's id has no label in the truth file.
    """
    # grab the labels from the truth file
    with open(truth_data_path, 'r') as f:
        label_dict = parse_truth_lines(f)

    # load the test data
    with open(test_only_data_path, newline='') as csvfile:
        rows = list(csv.reader(csvfile, delimiter='\t'))
    if not rows:
        raise ValueError(
            'No rows found in {}.'.format(test_only_data_path))

    # merge 'em
    rows[0].append('correctLabelW0orW1')
    for row in rows[1:]:
        if not row:
            # csv.reader yields [] for blank lines; there is no id to label.
            continue
        _id = row[0]
        try:
            label = label_dict[_id]
        except KeyError:
            raise KeyError(
                'No ground truth label for test id {!r}'.format(_id)
            ) from None
        row.append(label)

    # save output
    with open(test_data_path, 'w', newline='') as csvfile:
        csv.writer(csvfile, delimiter='\t').writerows(rows)


def main():
    """Merge the ARCT test data with its labels into test-original.csv.

    Raises:
        ValueError: If one of the input files is missing from the data dir.
    """
    test_only_data_path = os.path.join(glovar.ARCT_DIR, 'test-only-data.txt')
    truth_data_path = os.path.join(glovar.ARCT_DIR, 'truth.txt')
    test_data_path = os.path.join(glovar.ARCT_DIR, 'test-original.csv')

    # make sure the data is there
    if not os.path.exists(test_only_data_path):
        raise ValueError('Missing test-only-data.txt in data dir. '
                         'Run prepare.sh.')
    if not os.path.exists(truth_data_path):
        raise ValueError('Missing truth.txt in data dir. Run prepare.sh.')

    merge_test_labels(test_only_data_path, truth_data_path, test_data_path)


if __name__ == '__main__':
    main()