stats.py
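"""Compare a ground-truth classification CSV with a generated one and write
per-category confusion-matrix counts, precision, recall, and F1 scores to a CSV file."""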
import pathlib
from constants import categories, all_categories
def calculate_stats(truth_file: str, generated_file: str, output_folder: pathlib.Path | str | None = None):
cats_count = {"Governmental Law": {"tp": 0, "fp": 0, "fn": 0, "tn": 0, "precision": 0, "recall": 0, "F1-Score": 0},
"Best Practice": {"tp": 0, "fp": 0, "fn": 0, "tn": 0, "precision": 0, "recall": 0, "F1-Score": 0},
"Business Rule": {"tp": 0, "fp": 0, "fn": 0, "tn": 0, "precision": 0, "recall": 0, "F1-Score": 0},
"Law of Nature": {"tp": 0, "fp": 0, "fn": 0, "tn": 0, "precision": 0, "recall": 0, "F1-Score": 0}}
    try:
        f = open(truth_file, 'r', encoding='utf-8')
        g = open(generated_file, 'r', encoding='ISO-8859-1')
    except FileNotFoundError:
        print("File not found")
        return
    with f, g:
        f.readline()  # Skip separator line
        g.readline()
        # if f.readline() != g.readline():
        #     print("Files have different columns")
        #     return
        count = 0
        for line_truth, line_generated in zip(f, g):
            count += 1
            truth = line_truth.strip('\n').split(";")
            generated = line_generated.strip('\n').split(";")
            # if len(truth) != len(generated):
            #     print(f"Line {count} has different number of columns")
            #     return
            # check if we compare the same pair of activities
            if truth[0] != generated[0]:
                print(f"First activity is different in line {count}")
                return
            if truth[1] != generated[1]:
                print(f"Second activity is different in line {count}")
                return
            # compute confusion matrix values for each category
            for i in range(2, len(all_categories) + 2):
                category = all_categories[i - 2]
                truth_positive = truth[i] not in ("", "-")
                generated_positive = generated[i] not in ("", "-")
                # True Positive: both the ground truth and the generated result assign the pair to this category
                if truth_positive and generated_positive:
                    cats_count[category]["tp"] += 1
                # True Negative: neither the ground truth nor the generated result assigns the pair to this category
                elif not truth_positive and not generated_positive:
                    cats_count[category]["tn"] += 1
                # False Negative: the ground truth assigns the pair to this category but the generated result does not
                elif truth_positive and not generated_positive:
                    cats_count[category]["fn"] += 1
                # False Positive: the generated result assigns the pair to this category but the ground truth does not
                else:
                    cats_count[category]["fp"] += 1
    # compute precision, recall, and F1-score for each category
    for counts in cats_count.values():
        counts["precision"] = precision(counts["tp"], counts["fp"])
        counts["recall"] = recall(counts["tp"], counts["fn"])
        counts["F1-Score"] = f1_score(counts["tp"], counts["fp"], counts["fn"])
    # compute average precision, recall, and F1-Score for the categories best practice, governmental law, and business rule
    average_metrics = {"precision": 0, "recall": 0, "F1-Score": 0}
    for metric in average_metrics:
        total = 0
        for category in categories:
            total += cats_count[category][metric]
        average_metrics[metric] = total / len(categories)
    output_folder = output_folder or pathlib.Path(truth_file).parent / 'stats'
    output_folder = pathlib.Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)
    # if consensus:
    #     file_name = f"stats/consensus-{pathlib.Path(truth_file).stem}-{pathlib.Path(generated_file).stem}.csv"
    # else:
    file_name = output_folder / f"{pathlib.Path(generated_file).stem}.csv"
    print(file_name)
with open(file_name, "w") as stats_file:
stats_file.write("Category,TP,FN,FP,TN,Precision,Recall,F1\n")
for cat in all_categories:
stats_file.write(f"{cat},")
stats_file.write(f"{cats_count[cat]['tp']},")
stats_file.write(f"{cats_count[cat]['fn']},")
stats_file.write(f"{cats_count[cat]['fp']},")
stats_file.write(f"{cats_count[cat]['tn']},")
stats_file.write(f"{precision(cats_count[cat]['tp'], cats_count[cat]['fp']):.3f},")
stats_file.write(f"{recall(cats_count[cat]['tp'], cats_count[cat]['fn']):.3f},")
stats_file.write(f"{f1_score(cats_count[cat]['tp'], cats_count[cat]['fp'], cats_count[cat]['fn']):.3f}\n")
stats_file.write("Average (without law of nature),")
stats_file.write(f"-,")
stats_file.write(f"-,")
stats_file.write(f"-,")
stats_file.write(f"-,")
stats_file.write(f"{average_metrics['precision']:.3f},")
stats_file.write(f"{average_metrics['recall']:.3f},")
stats_file.write(f"{average_metrics['F1-Score']:.3f}\n")
def precision(tp: int, fp: int):
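    """Precision = tp / (tp + fp); returns 1.0 when there are no positive predictions."""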
    try:
        return tp / (tp + fp)
    except ZeroDivisionError:
        print("No positive predictions")
        return 1.000
def recall(tp: int, fn: int):
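    """Recall = tp / (tp + fn); returns 1.0 when there are no positive cases in the ground truth."""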
    try:
        return tp / (tp + fn)
    except ZeroDivisionError:
        print("No positive cases")
        return 1.000
def f1_score(tp: int, fp: int, fn: int):
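    """F1 = 2*tp / (2*tp + fp + fn); returns 1.0 when there are no positive cases or predictions."""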
    try:
        return (2 * tp) / (2 * tp + fp + fn)
    except ZeroDivisionError:
        print("No positive cases or predictions")
        return 1.000
# Example usage
if __name__ == '__main__':
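    # With no output_folder given, results are written to '<truth file's folder>/stats/<generated file's stem>.csv'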
    calculate_stats('./thesis_process/truth.csv',
                    './thesis_process/results/vanilla/rag/claude-3-5-haiku-latest/thesis_process-14012025-224724.csv')