post_process_mturk_evaluations.py
import argparse, os, sys, json, traceback, shutil
import pandas as pd
from tqdm import tqdm
import numpy as np
import krippendorff
from statsmodels.stats import inter_rater as irr
from statistics import mode, StatisticsError
# Class name to index mapping
CLASS_TO_INDEX = {
    'Makes sense': 1,
    'Sometimes makes sense': 2,
    'Does not make sense or Incorrect': 3,
    'First part and second part are not related! Or not enough information to judge': 4,
    'Unfamiliar to me to judge': 5
}
# Experimental class to index mapping
# CLASS_TO_INDEX = {
# 'Correct': 1,
# 'Sometimes correct': 2,
# 'Incorrect': 3,
# 'Not enough information to judge': 4,
# 'Unfamiliar to me to judge': 5
# }
def calculate_alpha_and_kappa_scores(annotations_df):
    """ Calculate Krippendorff's alpha (agreement between reviewers) at several category granularities.
    """
    two_category_ratings = np.zeros((len(annotations_df), 2))
    correct_incorrect_ratings = np.zeros((len(annotations_df), 2))
    three_category_ratings = np.zeros((len(annotations_df), 3))
    five_category_ratings = np.zeros((len(annotations_df), 5))
    for i, row in annotations_df.iterrows():
        for j in range(1, 4):
            review = int(float(row[f'review_{j}']))
            # Three categories: makes sense (1, 2), does not make sense (3, 4), unfamiliar (5)
            if review in (1, 2):
                three_category_ratings[i, 0] += 1
            elif review in (3, 4):
                three_category_ratings[i, 1] += 1
            else:
                three_category_ratings[i, 2] += 1
            # Two categories: makes sense (1, 2) vs. everything else
            if review in (1, 2):
                two_category_ratings[i, 0] += 1
            else:
                two_category_ratings[i, 1] += 1
            # Correct/Incorrect categories: makes sense (1, 2) vs. does not (3, 4); unfamiliar (5) is ignored
            if review in (1, 2):
                correct_incorrect_ratings[i, 0] += 1
            elif review in (3, 4):
                correct_incorrect_ratings[i, 1] += 1
            # Five categories: one column per class index
            five_category_ratings[i, review - 1] += 1
    # The matrices above are per-item value counts, so they are passed via the value_counts argument
    two_category_alpha = krippendorff.alpha(value_counts=two_category_ratings, level_of_measurement='nominal')
    correct_incorrect_alpha = krippendorff.alpha(value_counts=correct_incorrect_ratings, level_of_measurement='nominal')
    three_category_alpha = krippendorff.alpha(value_counts=three_category_ratings, level_of_measurement='nominal')
    five_category_alpha = krippendorff.alpha(value_counts=five_category_ratings, level_of_measurement='nominal')
    return two_category_alpha, correct_incorrect_alpha, three_category_alpha, five_category_alpha
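# A minimal usage sketch (illustrative only; `toy_df` below is a made-up DataFrame, not part of the pipeline).
# It shows the expected input: one row per item with integer class indices (1-5) in review_1..review_3.
#   toy_df = pd.DataFrame({'review_1': [1, 3, 5], 'review_2': [2, 3, 1], 'review_3': [1, 4, 5]})
#   alphas = calculate_alpha_and_kappa_scores(toy_df)
#   print(alphas)  # (two_category_alpha, correct_incorrect_alpha, three_category_alpha, five_category_alpha)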
def majority(votes, aws_vote):
    """ Resolve three worker votes into a majority vote.
    majority_vote falls back to the consolidated AWS/SageMaker vote when there is no majority;
    absolute_majority_vote is set to 0 in that case instead.
    """
    majority_vote, absolute_majority_vote = 0, 0
    # Check if there is a duplicate vote that yields an outright majority
    if len(set(votes)) != len(votes):
        majority_vote = mode(votes)
        absolute_majority_vote = majority_vote
    else:
        # If there is no duplicate, we categorize the votes: 1 & 2 are positive, 3 & 4 are negative, 5 is neutral
        positive_votes = [vote for vote in votes if vote in (1, 2)]
        negative_votes = [vote for vote in votes if vote in (3, 4)]
        neutral_votes = [vote for vote in votes if vote == 5]
        # If there are more positive votes than negative votes, we assign 1 as the majority vote
        if len(positive_votes) > len(negative_votes):
            majority_vote = 1
            absolute_majority_vote = 1
        # If there are more negative votes than positive votes, we assign 3 as the majority vote
        elif len(negative_votes) > len(positive_votes):
            majority_vote = 3
            absolute_majority_vote = 3
        # If there are more neutral votes than positive or negative votes, we assign 5 as the majority vote
        elif len(neutral_votes) > len(positive_votes) and len(neutral_votes) > len(negative_votes):
            majority_vote = 5
            absolute_majority_vote = 5
        # Otherwise, we assign the AWS vote as the majority vote
        else:
            majority_vote = aws_vote
            absolute_majority_vote = 0
    return majority_vote, absolute_majority_vote
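# Worked tie-breaking examples (hypothetical vote lists traced through the logic above; not real data):
#   majority([1, 1, 3], aws_vote=2) -> (1, 1)  # a duplicate vote wins outright via mode()
#   majority([1, 2, 5], aws_vote=4) -> (1, 1)  # two positive votes beat one neutral vote
#   majority([1, 3, 5], aws_vote=2) -> (2, 0)  # one vote per group: fall back to the AWS vote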
def update_out_tsv_from_manifest(mturk_path, out_tsv_path):
    """ Update the out_tsv file from prepare_generations_for_mturk_evaluation.py with the results from the manifest file
    """
    # Get the experiments path
    experiments_path = '/'.join(mturk_path.split('/')[:-2])
    # Extract the folder name
    folder_name = [part for part in mturk_path.split('/') if part != ''][-1]
    # New folder with the annotations folder name under the experiments path
    annotations_folder = f'{experiments_path}/{folder_name}'
    # Create the annotations folder if it does not exist
    if not os.path.exists(annotations_folder):
        os.makedirs(annotations_folder)
    # Copy the out_tsv_path file to the annotations folder
    shutil.copy(out_tsv_path, annotations_folder)
    # Updated out_tsv_path
    out_tsv_path = f'{annotations_folder}/{os.path.basename(out_tsv_path)}'
    # Get the metadata key
    metadata_key = f'{folder_name}-metadata'
    # Get the file name of out_tsv_path without the extension
    out_filename = os.path.basename(out_tsv_path).split('.')[0]
    out_folder = os.path.dirname(out_tsv_path)
    # Load the output manifest as a list
    evaluations_list = []
    with open(f'{mturk_path}/manifests/output/output.manifest') as f:
        for line in f:
            line = json.loads(line)
            evaluations_list.append(line)
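    # Each manifest line is a JSON object; the consolidated label is read from
    # '<labeling-job-name>-metadata' -> 'class-name'. An illustrative, assumed line shape:
    #   {"source": "...", "self-samples-negated-metadata": {"class-name": "Makes sense", ...}}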
    # Load the output TSV file
    out_tsv_df = pd.read_csv(out_tsv_path, sep='\t', header=0)
    # Annotations path
    annotations_path = f'{mturk_path}/annotations/worker-response/iteration-1'
    # Number of folders under the annotations path
    num_of_annotations = len(os.listdir(annotations_path))
    # Iterate over the loaded TSV file and update it with the results from the output manifest
    manifest_index = 0
    for index, row in tqdm(out_tsv_df.iterrows()):
        # Skip rows that were already auto-evaluated
        if row['review'] != 0 and (row['review_1'] == row['review_2'] == row['review_3'] == row['review']) and row['flagged_answer']:
            continue
        # Extract the source
        source = row['full_text']
        # Extract the consolidated review
        try:
            consolidated_class_name = evaluations_list[manifest_index][metadata_key]['class-name']
        except (IndexError, KeyError):
            print(f'Error in {source} under update_out_tsv_from_manifest method!')
            traceback.print_exc()
            import IPython; IPython.embed(); exit(1)
        review = CLASS_TO_INDEX[consolidated_class_name]
        # Update the output TSV file
        out_tsv_df.at[index, 'review'] = review
        # TODO: Add individual reviews
        # Load the individual reviews
        # worker_responses = evaluations[source][0][metadata_key]['worker-responses']
        annotation_json_folder_path = f'{annotations_path}/{manifest_index}'
        annotation_json_files = [f for f in os.listdir(annotation_json_folder_path) if os.path.isfile(os.path.join(annotation_json_folder_path, f))]
        # There should be exactly one worker-response file per item
        assert len(annotation_json_files) == 1, f'Found more than one or no annotation files for {source}; update implementation to handle this case!'
        annotation_file_path = f'{annotation_json_folder_path}/{annotation_json_files[0]}'
        with open(annotation_file_path) as f:
            annotation_json = json.load(f)
        answer_jsons = annotation_json['answers']
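        # The worker-response JSON is assumed to follow the SageMaker Ground Truth crowd-classifier
        # layout read below (illustrative shape, one entry per worker):
        #   {"answers": [{"answerContent": {"crowd-classifier": {"label": "Makes sense"}}}, ...]}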
        i = 1  # review index starts from 1
        # Iterate over the individual reviews
        for answer_json in answer_jsons:
            # Extract the individual review
            class_name = answer_json['answerContent']['crowd-classifier']['label']
            review = CLASS_TO_INDEX[class_name]
            # Update the output TSV file
            out_tsv_df.at[index, f'review_{i}'] = review
            i += 1
        # Update the manifest index (note that auto-evaluated rows are skipped in the output TSV file only)
        manifest_index += 1
    # Assert that all annotations are processed
    assert manifest_index == num_of_annotations, f'Number of annotations ({num_of_annotations}) does not match the number of processed annotations ({manifest_index})'
    # Calculate the majority vote.
    # Add empty columns for the majority vote:
    # majority_vote falls back to SageMaker's consolidated vote if there is no majority;
    # absolute_majority_vote is set to 0 if there is no majority
    out_tsv_df['majority_vote'] = 0
    out_tsv_df['absolute_majority_vote'] = 0
    for index, row in out_tsv_df.iterrows():
        votes = [int(row['review_1']), int(row['review_2']), int(row['review_3'])]
        aws_vote = int(row['review'])
        majority_vote, absolute_majority_vote = majority(votes, aws_vote)
        out_tsv_df.at[index, 'majority_vote'] = majority_vote
        out_tsv_df.at[index, 'absolute_majority_vote'] = absolute_majority_vote
    # Calculate the alpha scores (agreement between reviewers) to store in a separate txt file
    two_category_alpha, correct_incorrect_alpha, three_category_alpha, five_category_alpha = calculate_alpha_and_kappa_scores(out_tsv_df)
    # Save the updated output TSV file
    out_tsv_df.to_csv(out_tsv_path, sep='\t', index=False)
    print(f'Updated {out_tsv_path} with the results from {mturk_path}.')
    # Save the information, including the alpha scores, in a separate txt file
    out_filename_txt = f'{out_filename}.txt'
    with open(f'{out_folder}/{out_filename_txt}', 'w') as f:
        f.write(f'Two-Category Krippendorff Alpha score: {two_category_alpha}\n')
        f.write(f'Correct/Incorrect Krippendorff Alpha score: {correct_incorrect_alpha}\n')
        f.write(f'Three-Category Krippendorff Alpha score: {three_category_alpha}\n')
        f.write(f'Five-Category Krippendorff Alpha score: {five_category_alpha}\n')
    print(f'Wrote the alpha scores to {out_folder}/{out_filename_txt}.')
if __name__ == "__main__":
    """ Sample AWS CLI commands to download the results. Go to the mturk results folder and run:
        aws s3 cp --recursive s3://negated-predicates/self_samples/ ./
        aws s3 cp --recursive s3://[folder-path-to-results] ./
    """
    """ Corresponding paths and output files:
        - self_examples_eval:
            Negated:
                --mturk_path ./experiments/self_samples_eval/mturk_results/negated-results/self-samples-negated/
                --out_tsv ./experiments/self_samples_eval/few_shot_self_samples_to_eval_negated_preds_with_gpt_3_evaluated.tsv
            Normal:
                --mturk_path ./experiments/self_samples_eval/mturk_results/normal-results/self-samples-negated/
                --out_tsv ./experiments/self_samples_eval/few_shot_self_samples_to_eval_with_gpt_3_evaluated.tsv
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--mturk_path", type=str, default='./experiments/self_samples_eval/mturk_results/negated-results/self-samples-negated/')
    parser.add_argument("--out_tsv", type=str, default='./experiments/self_samples_eval/few_shot_self_samples_to_eval_negated_preds_with_gpt_3_evaluated.tsv')
    args = parser.parse_args()
    update_out_tsv_from_manifest(args.mturk_path, args.out_tsv)
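# Example invocation (uses the default paths above; adjust --mturk_path/--out_tsv to your own setup):
#   python post_process_mturk_evaluations.py \
#       --mturk_path ./experiments/self_samples_eval/mturk_results/negated-results/self-samples-negated/ \
#       --out_tsv ./experiments/self_samples_eval/few_shot_self_samples_to_eval_negated_preds_with_gpt_3_evaluated.tsv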