forked from gregdurrett/fp-dataset-artifacts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
monli_extension.py
50 lines (42 loc) · 2.65 KB
/
monli_extension.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import json
import argparse
# Read file from monli in jsonl format
# Sample lines:
# {"sentence1": "There is a man not wearing a hat staring at people on a subway.", "sentence2": "There is a man not wearing a sunhat staring at people on a subway.", "gold_label": "entailment", "depth": -1, "sentence1_lex": "hat", "sentence2_lex": "sunhat"}
def read_jsonl_file(filename):
    """Lazily yield one parsed JSON value per non-blank line of a JSONL file.

    Args:
        filename: Path of the JSON Lines file to read.

    Yields:
        The value decoded by ``json.loads`` for each non-blank line, in
        file order.

    Raises:
        OSError: If the file cannot be opened. Because this is a generator,
            the error surfaces on the first iteration, not at call time.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale default.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. an empty last line) instead of
            # letting json.loads fail on them.
            if not line.strip():
                continue
            yield json.loads(line)
# Desired format of new output:
# {"premise": "the man does not own a dog", "hypothesis": "the man does not own a husky", "label": 0}
# Label encoding: 0 is neutral, 1 is entailment, 2 is contradiction
def generate_new_dataset(in_filename, out_filename):
    """Write a negation-ablated dataset derived from a MoNLI JSONL file.

    Each input record produces two output lines, each a JSON object with
    the keys ``premise``, ``hypothesis`` and ``label``:

    1. the premise with ``'not '`` removed, paired with the original
       hypothesis;
    2. the original premise, paired with the hypothesis with ``'not '``
       removed.

    The labels written depend on the record's ``gold_label``:

    * ``'neutral'``: line 1 gets 2 (contradiction), line 2 gets 0 (neutral)
    * ``'entailment'``: line 1 gets 0 (neutral), line 2 gets 2 (contradiction)

    Args:
        in_filename: Path of the input JSONL file, read through
            ``read_jsonl_file``. Each record must provide ``sentence1``,
            ``sentence2`` and ``gold_label``.
        out_filename: Path of the output file. It is opened in ``'w'``
            mode, so any existing content is replaced.

    Raises:
        Exception: If a record's ``gold_label`` is neither ``'neutral'``
            nor ``'entailment'``.
    """
    # NOTE(review): this script uses 0 = neutral and 2 = contradiction. The
    # Hugging Face SNLI release uses 0 = entailment, 1 = neutral,
    # 2 = contradiction -- confirm which encoding the consumer expects.
    # NOTE(review): str.replace removes every occurrence of 'not ', including
    # inside longer words (e.g. 'knot ' becomes 'k') -- confirm the input
    # never contains such words.
    negation = 'not '

    with open(out_filename, 'w') as out:
        for record in read_jsonl_file(in_filename):
            gold = record['gold_label']
            # Pick the labels for (premise edited, hypothesis edited).
            if gold == 'neutral':
                premise_edit_label, hypothesis_edit_label = 2, 0
            elif gold == 'entailment':
                premise_edit_label, hypothesis_edit_label = 0, 2
            else:
                raise Exception('Unknown label: {}'.format(gold))

            premise = record['sentence1']
            premise_without_not = premise.replace(negation, '')
            hypothesis = record['sentence2']
            hypothesis_without_not = hypothesis.replace(negation, '')

            # The premise-edited example is always written first.
            examples = (
                (premise_without_not, hypothesis, premise_edit_label),
                (premise, hypothesis_without_not, hypothesis_edit_label),
            )
            for new_premise, new_hypothesis, label in examples:
                row = {
                    'premise': new_premise,
                    'hypothesis': new_hypothesis,
                    'label': label,
                }
                out.write(json.dumps(row) + '\n')
if __name__ == '__main__':
    # Command-line entry point: both file paths are required string options.
    cli = argparse.ArgumentParser()
    for flag in ('--in_f', '--out_f'):
        cli.add_argument(flag, type=str, required=True)
    options = cli.parse_args()
    generate_new_dataset(options.in_f, options.out_f)