Added Verbosity as an argument
djokester committed Jul 9, 2024
1 parent 1ff8201 commit 21dae65
Showing 12 changed files with 65 additions and 40 deletions.
1 change: 0 additions & 1 deletion README.md
@@ -27,7 +27,6 @@ evaluator = GroqEval(api_key=API_KEY)
 The evaluator is the central orchestrator that initializes the metrics.
 
 ```python
-from groqeval.evaluate import GroqEval
 metrics = evaluator(metric_name, **kwargs)
 ```
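Taken together with the README snippet above, this commit means a metric can be created with a `verbose` flag riding along in `**kwargs`. A minimal end-to-end sketch (the `answer_relevance` metric name and the `score()` call are inferred from the diffs below, not from documented API):

```python
import os
from groqeval import GroqEval

evaluator = GroqEval(api_key=os.getenv("GROQ_API_KEY"))

# verbose is picked up via kwargs.get('verbose') and forwarded to
# BaseMetric, which switches the metric's logger to INFO level.
answer_relevance = evaluator(
    "answer_relevance",
    prompt="What is the capital of France?",
    output="Paris is the capital of France.",
    verbose=True,
)
print(answer_relevance.score())
```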
4 changes: 3 additions & 1 deletion groqeval/__init__.py
@@ -1 +1,3 @@
-from groqeval.evaluate import GroqEval
+from groqeval.evaluate import GroqEval
+
+__all__ = ["GroqEval"]
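With the package root now re-exporting the class, the import the README previously spelled out becomes redundant; both forms below resolve to the same object:

```python
# The top-level import works because groqeval/__init__.py now
# declares __all__ = ["GroqEval"].
from groqeval import GroqEval
from groqeval.evaluate import GroqEval as GroqEvalDirect

assert GroqEval is GroqEvalDirect
```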
7 changes: 4 additions & 3 deletions groqeval/metrics/answer_relevance.py
@@ -12,8 +12,8 @@ class AnswerRelevance(BaseMetric):
     relevance to the original question, helping to gauge the utility and appropriateness
     of the model's responses.
     """
-    def __init__(self, groq_client: Groq, output: str, prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.output = output
         self.prompt = prompt
         self.check_data_types(prompt=prompt, output=output)
@@ -66,13 +66,13 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Output into Statements: %s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)
 
     def score_relevance(self):
@@ -93,6 +93,7 @@ def score_relevance(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Answer Relevance Score: %s", response.choices[0].message.content)
         return ScoredOutput.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
 
     def score(self):
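A practical note on these new `self.logger.info(...)` calls: `setLevel(logging.INFO)` only lowers the logger's threshold, and Python's last-resort handler prints nothing below WARNING, so a host application has to configure a handler to actually see the verbose output, e.g.:

```python
import logging

# Attach a root handler so INFO records from the metrics are emitted;
# without this, verbose=True changes the level but prints nothing.
logging.basicConfig(level=logging.INFO)
```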
35 changes: 21 additions & 14 deletions groqeval/metrics/base_metric.py
@@ -1,9 +1,16 @@
+import logging
+from groq import Groq
+
 class BaseMetric:
     """
     The Base Metric class.
     """
-    def __init__(self, groq_client):
+    def __init__(self, groq_client: Groq, verbose: bool = None):
         self.groq_client = groq_client
+        self.logger = logging.getLogger(__name__)
+        if verbose:
+            self.logger.setLevel(logging.INFO)
 
 
     def groq_chat_completion(self, messages, model, temperature=0.5, response_format=None):
         """
@@ -15,27 +22,27 @@ def groq_chat_completion(self, messages, model, temperature=0.5, response_format
             temperature=temperature,
             response_format=response_format
         )
-        print(chat_completion.choices[0].message.content)
         return chat_completion
 
     def check_data_types(self, **kwargs):
         """
         Checks for empty strings in the arguments
         """
         for key, value in kwargs.items():
-            if key != "context":
-                if value == "":
-                    raise ValueError(f"'{key}' cannot be an empty string.")
-                if not isinstance(value, str):
-                    raise TypeError(f"'{key}' must be a string")
-            else:
-                if len(value) == 0:
-                    raise ValueError(f"'{key}' cannot be an empty list.")
-                if not isinstance(value, list):
-                    raise TypeError(f"'{key}' must be a list of strings")
-                if not all(isinstance(item, str) for item in value):
-                    raise TypeError(f"All items in '{key}' must be strings")
+            if key != "verbose":
+                if key != "context":
+                    if value == "":
+                        raise ValueError(f"'{key}' cannot be an empty string.")
+                    if not isinstance(value, str):
+                        raise TypeError(f"'{key}' must be a string")
+                else:
+                    if not all(isinstance(item, str) for item in value):
+                        raise TypeError(f"All items in '{key}' must be strings")
+                    if len(value) == 0:
+                        raise ValueError(f"'{key}' cannot be an empty list.")
+                    if not isinstance(value, list):
+                        raise TypeError(f"'{key}' must be a list of strings")



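The verbose handling above is easiest to see in isolation. A standalone sketch (illustration only, not code from this commit):

```python
import logging

logging.basicConfig()  # root logger stays at its default WARNING level

def make_logger(name: str, verbose: bool = None) -> logging.Logger:
    """Mimics BaseMetric: raise the logger to INFO only when verbose."""
    logger = logging.getLogger(name)
    if verbose:
        logger.setLevel(logging.INFO)
    return logger

make_logger("verbose_metric", verbose=True).info("emitted")
make_logger("quiet_metric").info("suppressed at the default WARNING level")
```

One consequence of `logging.getLogger(__name__)` in `BaseMetric` is that every metric subclass shares the same module-level logger, so constructing a single metric with `verbose=True` leaves INFO logging enabled for all metrics created afterwards in the same process.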
7 changes: 4 additions & 3 deletions groqeval/metrics/bias.py
@@ -12,8 +12,8 @@ class Bias(BaseMetric):
     context-driven expressions. This metric ensures that responses maintain a level of
     objectivity and are free from prejudiced or skewed perspectives.
     """
-    def __init__(self, groq_client: Groq, output: str, prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.output = output
         self.prompt = prompt
         self.check_data_types(prompt=prompt, output=output)
@@ -70,13 +70,13 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Output into Opinions: %s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)
 
     def score_bias(self):
@@ -97,6 +97,7 @@ def score_bias(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Bias Score: %s", response.choices[0].message.content)
         return ScoredOutput.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
 
     def score(self):
7 changes: 4 additions & 3 deletions groqeval/metrics/context_relevance.py
@@ -13,8 +13,8 @@ class ContextRelevance(BaseMetric):
     to the generator is pertinent and likely to enhance the quality and
     accuracy of the generated responses.
     """
-    def __init__(self, groq_client: Groq, context: List[str], prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, context: List[str], prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.context = context
         self.prompt = prompt
         self.check_data_types(prompt=prompt, context=context)
@@ -79,13 +79,13 @@ def context_decomposition(self):
             {"role": "system", "content": self.context_decomposition_prompt},
             {"role": "user", "content": self.format_retrieved_context}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Context into Statements: %s", response.choices[0].message.content)
         return Context.model_validate_json(response.choices[0].message.content)
 
     def score_relevance(self):
@@ -110,6 +110,7 @@ def score_relevance(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Context Relevance Score: %s", response.choices[0].message.content)
         return ScoredContext.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
 
     def score(self):
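Continuing the sketch from the README section above: retrieval-side metrics such as `ContextRelevance` take a list of context strings instead of a model output, and the same assumed `verbose` flag applies:

```python
# context must be a non-empty list of strings; check_data_types
# enforces this, while the verbose key itself is skipped by the checks.
context_relevance = evaluator(
    "context_relevance",
    prompt="What is the capital of France?",
    context=[
        "Paris is the capital and most populous city of France.",
        "France is a country in Western Europe.",
    ],
    verbose=True,
)
print(context_relevance.score())
```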
8 changes: 4 additions & 4 deletions groqeval/metrics/faithfulness.py
@@ -12,8 +12,8 @@ class Faithfulness(BaseMetric):
     content is not only relevant but also accurate and truthful with respect to the given context,
     critical for maintaining the integrity and reliability of the model's responses.
     """
-    def __init__(self, groq_client: Groq, context: List[str], output: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, context: List[str], output: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.context = context
         self.output = output
         self.check_data_types(context=context, output=output)
@@ -80,13 +80,13 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Output into Claims: %s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)
 
     def score_faithfulness(self):
@@ -106,13 +106,13 @@ def score_faithfulness(self):
             {"role": "system", "content": self.faithfulness_prompt},
             {"role": "user", "content": json.dumps({"sentences": [s.string for s in coherent_sentences]}, indent=2)}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Faithfulness Score: %s", response.choices[0].message.content)
         return ScoredOutput.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
 
     def score(self):
7 changes: 4 additions & 3 deletions groqeval/metrics/hallucination.py
@@ -13,8 +13,8 @@ class Hallucination(BaseMetric):
     This is crucial for ensuring that the generated outputs remain grounded in the provided
     context and do not mislead or introduce inaccuracies.
     """
-    def __init__(self, groq_client: Groq, context: List[str], output: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, context: List[str], output: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.context = context
         self.output = output
         self.check_data_types(context=context, output=output)
@@ -89,13 +89,13 @@ def context_decomposition(self):
             {"role": "system", "content": self.context_decomposition_prompt},
             {"role": "user", "content": self.format_retrieved_context}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Decomposition of the Context into Statements: %s", response.choices[0].message.content)
         return Context.model_validate_json(response.choices[0].message.content)
 
     def score_hallucination(self):
@@ -116,6 +116,7 @@ def score_hallucination(self):
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Hallucination Score: %s", response.choices[0].message.content)
         return ScoredContext.model_validate_json(response.choices[0].message.content), json.loads(response.choices[0].message.content)
 
     def score(self):
6 changes: 3 additions & 3 deletions groqeval/metrics/toxicity.py
@@ -12,8 +12,8 @@ class Toxicity(BaseMetric):
     wider consumption, identifying any language that could be considered
     insulting, aggressive, or otherwise damaging.
     """
-    def __init__(self, groq_client: Groq, output: str, prompt: str):
-        super().__init__(groq_client)
+    def __init__(self, groq_client: Groq, output: str, prompt: str, **kwargs):
+        super().__init__(groq_client, kwargs.get('verbose'))
         self.output = output
         self.prompt = prompt
         self.check_data_types(prompt=prompt, output=output)
@@ -69,13 +69,13 @@ def output_decomposition(self):
             {"role": "system", "content": self.output_decomposition_prompt},
             {"role": "user", "content": self.output}
         ]
-        print(messages)
         response = self.groq_chat_completion(
             messages=messages,
             model="llama3-70b-8192",
             temperature=0,
             response_format={"type": "json_object"}
         )
+        self.logger.info("Breakdown of the Toxicity Score: %s", response.choices[0].message.content)
         return Output.model_validate_json(response.choices[0].message.content)
 
     def score_toxicity(self):
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -21,7 +21,8 @@ readme = "README.md"
 requires-python = ">=3.10"
 
 dependencies = [
-    "groq==0.9.0"
+    "groq>=0.9.0",
+    "pydantic>=2.7.4"
 ]
 
 [tool.twine]
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
-groq==0.9.0
+groq==0.9.0
+pydantic==2.7.4
17 changes: 14 additions & 3 deletions tests/test_evaluate.py
@@ -1,6 +1,6 @@
 import os
 import importlib
-from typing import List, Dict
+import pytest
 from conftest import get_class_args, generate_random_value
 
 def metricize(file_name: str):
@@ -26,5 +26,16 @@ def test_load_metrics(evaluator, metrics_folder, metrics_module):
         class_ = getattr(module, class_name)
         class_args = get_class_args(class_)
         random_args = {name: generate_random_value(param) for name, param in class_args.items()}
-        print(class_name, random_args)
-        assert type(evaluator(module_name, **random_args)) == class_
+        assert type(evaluator(module_name, **random_args)) == class_
+
+def test_load_base_metric(evaluator, metrics_module):
+    module_name = "base_metric"
+    module_path = f'{metrics_module}.{"base_metric"}'
+    module = importlib.import_module(module_path)
+    class_name = metricize(module_name)
+
+    class_ = getattr(module, class_name)
+    class_args = get_class_args(class_)
+    random_args = {name: generate_random_value(param) for name, param in class_args.items()}
+    with pytest.raises(TypeError, match=f"{class_name} is not a valid metric class"):
+        base_metric = evaluator(module_name, **random_args)

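The new test pins down the expected failure mode: requesting `base_metric` through the evaluator must raise `TypeError` instead of returning a usable metric. The evaluator-side guard is not part of this diff; it presumably resembles the following sketch, where the `issubclass` check and surrounding names are assumptions and only the error message is taken from the test:

```python
# Hypothetical guard inside the evaluator -- not shown in this commit.
if class_ is BaseMetric or not issubclass(class_, BaseMetric):
    raise TypeError(f"{class_name} is not a valid metric class")
```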