# new.py - generated from The-Swarm-Corporation/Multi-Agent-Template-App
import json
import time
from concurrent.futures import ThreadPoolExecutor
from difflib import SequenceMatcher
from functools import partial
from pathlib import Path
from typing import Any, Dict, List, Optional, Protocol

import numpy as np
from loguru import logger
from pydantic import BaseModel
from scipy import stats

from evalops.function_eval import FunctionCallEvaluator
from evalops.main import StatisticalModelEvaluator

class ModelInterface(Protocol):
    """Protocol defining the required interface for model classes."""

    def run(self, task: str, img: Optional[str] = None) -> str:
        """Run the model on a given task."""
        ...

class EvalResult(BaseModel):
    """Stores evaluation results for a single model run."""

    mean_score: float
    sem: float
    ci_lower: float
    ci_upper: float
    raw_scores: List[float]
    metadata: Dict[str, Any]
    function_call_results: Optional[Dict[str, Any]] = None
    sentiment_score: Optional[float] = None

class FunctionCallResult(BaseModel):
    """Stores the evaluation results for function calling tests."""

    schema_valid: bool
    execution_valid: bool
    schema_errors: List[str]
    execution_errors: List[str]
    matching_score: float
    metadata: Dict[str, Any]

class IntegratedModelEvaluator:
    """
    Enhanced model evaluator that combines statistical evaluation,
    function calling assessment, and sentiment analysis.
    """

    def __init__(
        self,
        cache_dir: Optional[str] = None,
        log_level: str = "INFO",
        random_seed: Optional[int] = None,
    ):
        # Initialize base statistical evaluator
        self.statistical_evaluator = StatisticalModelEvaluator(
            cache_dir=cache_dir,
            log_level=log_level,
            random_seed=random_seed,
        )

        # Initialize function call evaluator
        self.function_evaluator = FunctionCallEvaluator(
            self.statistical_evaluator
        )

        self.cache_dir = Path(cache_dir) if cache_dir else None
        if self.cache_dir:
            self.cache_dir.mkdir(parents=True, exist_ok=True)

        if random_seed is not None:
            np.random.seed(random_seed)

        logger.add(
            lambda msg: print(msg),
            level=log_level,
            format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
        )
    def _calculate_score(
        self, prediction: str, correct_answer: str
    ) -> float:
        """Calculate similarity score between prediction and correct answer."""
        prediction = prediction.strip().lower()
        correct_answer = correct_answer.strip().lower()

        if correct_answer in prediction:
            return 1.0

        similarity = SequenceMatcher(
            None, prediction, correct_answer
        ).ratio()
        return similarity if similarity > 0.8 else 0.0
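
    # NOTE: evaluate_model below calls self._evaluate_single_question, but no
    # such method is defined in this module, and the class holds the
    # StatisticalModelEvaluator by composition rather than inheritance. The
    # following is a minimal sketch of that helper, assuming it should run the
    # model num_samples times and average the _calculate_score similarities.
    def _evaluate_single_question(
        self,
        model: ModelInterface,
        question: str,
        correct_answer: str,
        num_samples: int = 1,
    ) -> float:
        """Run the model on one question and return the mean similarity score."""
        scores = []
        for _ in range(num_samples):
            prediction = model.run(question)
            scores.append(self._calculate_score(prediction, correct_answer))
        return float(sum(scores) / len(scores))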
    def _analyze_sentiment(self, text: str) -> float:
        """
        Analyze sentiment in text and return a score between 0.1 and 1.0.
        Basic implementation - could be enhanced with more sophisticated NLP.
        """
        # Lists of positive and negative sentiment words
        positive_words = {
            "good",
            "great",
            "excellent",
            "amazing",
            "wonderful",
            "fantastic",
            "helpful",
            "perfect",
            "thank",
            "thanks",
            "appreciated",
            "love",
            "nice",
        }
        negative_words = {
            "bad",
            "poor",
            "terrible",
            "horrible",
            "useless",
            "waste",
            "unhelpful",
            "wrong",
            "fail",
            "failed",
            "confused",
            "disappointing",
        }

        words = text.lower().split()
        pos_count = sum(1 for word in words if word in positive_words)
        neg_count = sum(1 for word in words if word in negative_words)
        total_count = pos_count + neg_count

        if total_count == 0:
            return 0.5  # Neutral sentiment

        # The early return above handles total_count == 0, so the ratio is safe.
        sentiment = pos_count / total_count

        # Scale to the 0.1-1.0 range
        return max(0.1, min(1.0, 0.1 + sentiment * 0.9))
    def validate_function_schema(
        self, schema: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validates whether a given function schema follows the JSON Schema specification."""
        return self.function_evaluator.validate_function_schema(
            schema
        )

    def evaluate_function_call(
        self,
        function_schema: Dict[str, Any],
        test_cases: List[Dict[str, Any]],
        expected_outputs: List[Any],
    ) -> FunctionCallResult:
        """Evaluates a function calling implementation against test cases."""
        return self.function_evaluator.evaluate_function_call(
            function_schema=function_schema,
            test_cases=test_cases,
            expected_outputs=expected_outputs,
        )
    def _compare_outputs(self, actual: Any, expected: Any) -> float:
        """Compares the actual output with the expected output and returns a similarity score."""
        if isinstance(actual, dict) and isinstance(expected, dict):
            actual_keys = set(actual.keys())
            expected_keys = set(expected.keys())

            if not expected_keys:
                # Guard against division by zero when the expected dict is empty.
                return float(actual == expected)

            key_similarity = len(actual_keys & expected_keys) / len(
                expected_keys
            )

            value_scores = []
            for key in actual_keys & expected_keys:
                value_scores.append(
                    self._compare_outputs(actual[key], expected[key])
                )
            value_similarity = (
                sum(value_scores) / len(value_scores)
                if value_scores
                else 0
            )

            return (key_similarity + value_similarity) / 2

        elif isinstance(actual, (list, tuple)) and isinstance(
            expected, (list, tuple)
        ):
            if len(actual) != len(expected):
                return 0.5
            if not expected:
                # Two empty sequences match exactly.
                return 1.0
            element_scores = [
                self._compare_outputs(a, e)
                for a, e in zip(actual, expected)
            ]
            return sum(element_scores) / len(element_scores)

        else:
            return float(actual == expected)
    def evaluate_model(
        self,
        model: ModelInterface,
        questions: List[str],
        correct_answers: List[str],
        imgs: Optional[List[str]] = None,
        cluster_ids: Optional[List[str]] = None,
        num_samples: int = 1,
        batch_size: int = 32,
        cache_key: Optional[str] = None,
        function_schema: Optional[Dict[str, Any]] = None,
        function_test_cases: Optional[List[Dict[str, Any]]] = None,
        function_expected_outputs: Optional[List[Any]] = None,
        analyze_sentiment: bool = False,
    ) -> EvalResult:
        """
        Enhanced evaluation that includes statistical analysis, function calling,
        and optional sentiment analysis.

        Note: the `imgs` argument is accepted but is not currently forwarded to
        the model in this implementation.
        """
        start_time = time.time()

        # Check cache
        if cache_key and self.cache_dir:
            cache_path = self.cache_dir / f"{cache_key}.json"
            if cache_path.exists():
                with open(cache_path) as f:
                    return EvalResult(**json.load(f))

        # Validate inputs
        assert len(questions) == len(
            correct_answers
        ), "Questions and answers must have the same length"
        if cluster_ids:
            assert len(cluster_ids) == len(
                questions
            ), "Cluster IDs must match question length"

        # Run model predictions
        all_scores = []
        sentiment_scores = [] if analyze_sentiment else None

        with ThreadPoolExecutor() as executor:
            for i in range(0, len(questions), batch_size):
                batch_questions = questions[i : i + batch_size]
                batch_answers = correct_answers[i : i + batch_size]

                tasks = [
                    partial(
                        self._evaluate_single_question,
                        model,
                        q,
                        a,
                        num_samples,
                    )
                    for q, a in zip(batch_questions, batch_answers)
                ]

                batch_scores = list(
                    executor.map(lambda f: f(), tasks)
                )
                all_scores.extend(batch_scores)

                if analyze_sentiment:
                    # Note: this runs the model a second time per question,
                    # solely to score the sentiment of its responses.
                    batch_predictions = [
                        model.run(q) for q in batch_questions
                    ]
                    sentiment_scores.extend(
                        [
                            self._analyze_sentiment(p)
                            for p in batch_predictions
                        ]
                    )

        # Calculate statistics
        scores_array = np.array(all_scores)
        mean_score = np.mean(scores_array)

        if cluster_ids:
            sem = self._calculate_clustered_sem(
                scores_array, cluster_ids
            )
        else:
            sem = stats.sem(scores_array)

        ci_lower, ci_upper = stats.norm.interval(
            0.95, loc=mean_score, scale=sem
        )

        # Evaluate function calling if provided
        function_results = None
        if (
            function_schema
            and function_test_cases
            and function_expected_outputs
        ):
            function_results = self.evaluate_function_call(
                function_schema,
                function_test_cases,
                function_expected_outputs,
            ).__dict__

        # Create result
        result = EvalResult(
            mean_score=float(mean_score),
            sem=float(sem),
            ci_lower=float(ci_lower),
            ci_upper=float(ci_upper),
            raw_scores=all_scores,
            metadata={
                "num_questions": len(questions),
                "num_samples": num_samples,
                "has_clusters": cluster_ids is not None,
                "evaluation_time": time.time() - start_time,
            },
            function_call_results=function_results,
            sentiment_score=(
                float(np.mean(sentiment_scores))
                if sentiment_scores
                else None
            ),
        )

        # Cache results
        if cache_key and self.cache_dir:
            cache_path = self.cache_dir / f"{cache_key}.json"
            with open(cache_path, "w") as f:
                json.dump(result.__dict__, f)

        return result
    def _calculate_clustered_sem(
        self, scores: np.ndarray, cluster_ids: List[str]
    ) -> float:
        """Calculate the clustered standard error of the mean."""
        import pandas as pd

        df = pd.DataFrame({"score": scores, "cluster": cluster_ids})

        # Clustered SEM: sqrt(Var(cluster means) / n_clusters)
        cluster_means = df.groupby("cluster")["score"].mean()
        n_clusters = len(cluster_means)
        cluster_variance = cluster_means.var()

        return np.sqrt(cluster_variance / n_clusters)

def create_test_suite(
    function_schema: Dict[str, Any], num_cases: int = 10
) -> List[Dict[str, Any]]:
    """Creates a test suite for a given function schema."""
    test_cases = []
    properties = function_schema["parameters"]["properties"]

    for _ in range(num_cases):
        test_case = {}
        for prop_name, prop_schema in properties.items():
            test_case[prop_name] = _generate_test_value(prop_schema)
        test_cases.append(test_case)

    return test_cases

def _generate_test_value(property_schema: Dict[str, Any]) -> Any:
    """Helper function to generate test values based on a property schema."""
    schema_type = property_schema.get("type", "string")

    if schema_type == "string":
        return "test_string"
    elif schema_type == "number":
        return 42.0
    elif schema_type == "integer":
        return 42
    elif schema_type == "boolean":
        return True
    elif schema_type == "array":
        items_schema = property_schema.get(
            "items", {"type": "string"}
        )
        return [_generate_test_value(items_schema) for _ in range(2)]
    elif schema_type == "object":
        obj = {}
        for prop_name, prop_schema in property_schema.get(
            "properties", {}
        ).items():
            obj[prop_name] = _generate_test_value(prop_schema)
        return obj
    else:
        return None
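

# Example usage: a minimal sketch assuming evalops is installed. DummyModel is
# a hypothetical stand-in that satisfies ModelInterface without calling any
# external API, and get_weather is a hypothetical schema shaped the way
# create_test_suite expects ({"parameters": {"properties": {...}}}).
if __name__ == "__main__":

    class DummyModel:
        """Minimal ModelInterface implementation that returns a canned answer."""

        def run(self, task: str, img: Optional[str] = None) -> str:
            return "Paris is the capital of France. Great question, thanks!"

    evaluator = IntegratedModelEvaluator(cache_dir=None, random_seed=42)

    result = evaluator.evaluate_model(
        model=DummyModel(),
        questions=[
            "What is the capital of France?",
            "What is the capital of the United Kingdom?",
        ],
        correct_answers=["paris", "london"],
        num_samples=2,
        analyze_sentiment=True,
    )
    print(f"Mean score: {result.mean_score:.2f}")
    print(f"Sentiment score: {result.sentiment_score}")

    # Generate synthetic test cases for a hypothetical function schema.
    example_schema = {
        "name": "get_weather",
        "parameters": {
            "properties": {
                "city": {"type": "string"},
                "days": {"type": "integer"},
            }
        },
    }
    print(create_test_suite(example_schema, num_cases=2))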