test_evaluation.py (forked from gregdurrett/fp-dataset-artifacts)
import argparse
import json

import pandas as pd

# Same analysis as eval_evaluation.py, but the records are stored in a pandas
# DataFrame instead of a plain list.
# Read a JSONL evaluation file: one JSON object per line.
def read_evaluation_file(evaluation_file):
    lines = []
    with open(evaluation_file, 'r') as f:
        for line in f:
            lines.append(json.loads(line))
    return lines
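# A hypothetical input line (field names taken from the columns this script
# uses; the label values are illustrative):
#   {"premise": "A man is here.", "hypothesis": "A man is not here.",
#    "label": 2, "predicted_label": 0}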
# Load the parsed lines into a DataFrame.
def load_lines(lines):
    df = pd.DataFrame(lines)
    return df
# Add a column indicating whether the prediction was correct.
def add_correct_column(df):
    df['correct'] = df['label'] == df['predicted_label']
    return df
# Add columns indicating whether the premise / hypothesis contain the word
# 'not' (matched on word boundaries, so 'not' at the start of a sentence or
# next to punctuation also counts).
def add_not_columns(df):
    df['premise_contains_not'] = df['premise'].str.contains(r'\bnot\b', case=False)
    df['hypothesis_contains_not'] = df['hypothesis'].str.contains(r'\bnot\b', case=False)
    return df
# Add a column indicating whether the premise and hypothesis both contain 'not'.
def add_double_negation_column(df):
    df['double_negation'] = df['premise_contains_not'] & df['hypothesis_contains_not']
    return df

# Add a column indicating whether exactly one of the two sentences contains 'not'.
def add_single_negation_column(df):
    df['single_negation'] = (df['premise_contains_not'] | df['hypothesis_contains_not']) & ~df['double_negation']
    return df
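# How the flags combine, on three hypothetical rows:
#   premise="A man is not here",  hypothesis="A man is here"   -> single_negation
#   premise="It is not cold",     hypothesis="It is not warm"  -> double_negation
#   premise="A dog runs outside", hypothesis="An animal moves" -> neither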
# Main function
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--evaluation_file', type=str, required=True)
    args = parser.parse_args()
    lines = read_evaluation_file(args.evaluation_file)
    df = load_lines(lines)
    df = add_correct_column(df)
    df = add_not_columns(df)
    df = add_double_negation_column(df)
    df = add_single_negation_column(df)
    # Print the number of correct and incorrect predictions.
    correct = df[df['correct']]
    incorrect = df[~df['correct']]
    print('Number of correct predictions: {}'.format(len(correct)))
    print('Number of incorrect predictions: {}'.format(len(incorrect)))
    # Break both groups down by negation type.
    print('Number of incorrect predictions with single negation: {}'.format(incorrect['single_negation'].sum()))
    print('Number of incorrect predictions with double negation: {}'.format(incorrect['double_negation'].sum()))
    print('Number of correct predictions with single negation: {}'.format(correct['single_negation'].sum()))
    print('Number of correct predictions with double negation: {}'.format(correct['double_negation'].sum()))
    # Print the fraction of incorrect predictions in each negation category,
    # guarding against division by zero when every prediction is correct.
    if not incorrect.empty:
        print('Fraction of incorrect predictions with single negation: {}'.format(incorrect['single_negation'].mean()))
        print('Fraction of incorrect predictions with double negation: {}'.format(incorrect['double_negation'].mean()))
    # Display the above stats broken down by gold label, then by predicted label.
    print(df.groupby('label').agg({'correct': ['sum', 'count'], 'single_negation': ['sum'], 'double_negation': ['sum']}))
    print(df.groupby('predicted_label').agg({'correct': ['sum', 'count'], 'single_negation': ['sum'], 'double_negation': ['sum']}))
    # The same two tables with rates (means) instead of sums and counts.
    print(df.groupby('label').agg({'correct': ['mean'], 'single_negation': ['mean'], 'double_negation': ['mean']}))
    print(df.groupby('predicted_label').agg({'correct': ['mean'], 'single_negation': ['mean'], 'double_negation': ['mean']}))
    # Accuracy tables for each of the three subsets: double negation, single
    # negation, and no negation. The no-negation subset contains the rows with
    # no 'not' in either the premise or the hypothesis.
    subsets = [
        ('Double Negation', df[df['double_negation']]),
        ('Single Negation', df[df['single_negation']]),
        ('No Negation', df[~df['single_negation'] & ~df['double_negation']]),
    ]
    for name, subset in subsets:
        if subset.empty:
            continue
        print(name)
        # Accuracy per label, and per (label, predicted_label) pair.
        print(subset.groupby('label').agg({'correct': ['mean']}))
        print(subset.groupby(['label', 'predicted_label']).agg({'correct': ['mean']}))
        # The same two tables with counts instead of rates.
        print(subset.groupby('label').agg({'correct': ['sum', 'count']}))
        print(subset.groupby(['label', 'predicted_label']).agg({'correct': ['sum', 'count']}))
    # Overall accuracy for each of the three categories.
    print("Category Accuracy")
    for name, subset in subsets:
        if not subset.empty:
            print(name)
            print(subset['correct'].mean())
    # Table of overall accuracy and counts, by label, by predicted label, and by both.
    print("Overall Accuracy")
    print(df.groupby('label').agg({'correct': ['mean', 'count']}))
    print(df.groupby('predicted_label').agg({'correct': ['mean', 'count']}))
    print(df.groupby(['label', 'predicted_label']).agg({'correct': ['mean', 'count']}))
if __name__ == '__main__':
    main()
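# Example invocation (the path is an assumption; any JSONL file whose records
# carry premise/hypothesis/label/predicted_label fields should work):
#   python test_evaluation.py --evaluation_file eval_output/eval_predictions.jsonl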