diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py
index e0e4934d76..214c435c02 100644
--- a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py
+++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py
@@ -6,6 +6,8 @@
 import logging
 from pathlib import Path
 
+import numpy as np
+
 module_logger = logging.getLogger(__name__)
 module_logger.setLevel(logging.INFO)
 
@@ -30,3 +32,16 @@ def get_genai_metric(metric_name, **metric_kwargs):
     metric = evaluate.load(
         str(curr_file_dir.joinpath(f'scripts/{metric_name}.py')))
     return metric.compute(**metric_kwargs)
+
+
+def get_genai_metric_mean(metric_name, **metric_kwargs):
+    """Get the mean of the metric from the genai library.
+
+    :param metric_name: The name of the metric.
+    :type metric_name: str
+    :param metric_kwargs: The keyword arguments to pass to the metric.
+    :type metric_kwargs: dict
+    :return: The mean of the metric.
+    :rtype: float
+    """
+    return np.mean(get_genai_metric(metric_name, **metric_kwargs)['scores'])
diff --git a/responsibleai_text/tests/test_genai_metrics.py b/responsibleai_text/tests/test_genai_metrics.py
index 5285d6c623..8cf530e5ad 100644
--- a/responsibleai_text/tests/test_genai_metrics.py
+++ b/responsibleai_text/tests/test_genai_metrics.py
@@ -1,7 +1,8 @@
 # Copyright (c) Microsoft Corporation
 # Licensed under the MIT License.
 
-from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric
+from responsibleai_text.utils.genai_metrics.metrics import (
+    get_genai_metric, get_genai_metric_mean)
 
 PREDICTIONS = ['This is a prediction']
 REFERENCES = ['This is a reference']
@@ -15,69 +16,48 @@ def predict(self, inp):
 
 
 class TestGenAIMetrics:
-    def test_coherence(self):
-        metric = get_genai_metric('coherence',
-                                  predictions=PREDICTIONS,
-                                  references=REFERENCES,
+    def assert_metrics(self, metric_name,
+                       expected, input_len,
+                       **metric_kwargs):
+        metric = get_genai_metric(metric_name, **metric_kwargs,
                                   wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1]
+        assert metric['scores'] == [expected]
 
-        metric = get_genai_metric('coherence',
-                                  predictions=PREDICTIONS * 5,
-                                  references=REFERENCES * 5,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1] * 5
+        metric_mean = get_genai_metric_mean(metric_name, **metric_kwargs,
+                                            wrapper_model=DummyModelWrapper())
+        assert metric_mean == expected
 
-    def test_equivalence(self):
-        metric = get_genai_metric('equivalence',
-                                  predictions=PREDICTIONS,
-                                  references=REFERENCES,
-                                  answers=ANSWERS,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1]
+        kwargs_multi = {k: v * input_len for k, v in metric_kwargs.items()}
+        metric_multi = get_genai_metric(metric_name, **kwargs_multi,
+                                        wrapper_model=DummyModelWrapper())
+        assert metric_multi['scores'] == [expected] * input_len
 
-        metric = get_genai_metric('equivalence',
-                                  predictions=PREDICTIONS * 5,
-                                  references=REFERENCES * 5,
-                                  answers=ANSWERS * 5,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1] * 5
+        metric_mean_multi = get_genai_metric_mean(
+            metric_name, **kwargs_multi, wrapper_model=DummyModelWrapper())
+        assert metric_mean_multi == expected
 
-    def test_fluency(self):
-        metric = get_genai_metric('fluency',
-                                  predictions=PREDICTIONS,
-                                  references=REFERENCES,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1]
+    def test_coherence(self):
+        self.assert_metrics('coherence', 1, 5,
+                            predictions=PREDICTIONS,
+                            references=REFERENCES)
 
-        metric = get_genai_metric('fluency',
-                                  predictions=PREDICTIONS * 5,
-                                  references=REFERENCES * 5,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1] * 5
+    def test_equivalence(self):
+        self.assert_metrics('equivalence', 1, 5,
+                            predictions=PREDICTIONS,
+                            references=REFERENCES,
+                            answers=ANSWERS)
 
-    def test_groundedness(self):
-        metric = get_genai_metric('groundedness',
-                                  predictions=PREDICTIONS,
-                                  references=REFERENCES,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1]
+    def test_fluency(self):
+        self.assert_metrics('fluency', 1, 5,
+                            predictions=PREDICTIONS,
+                            references=REFERENCES)
 
-        metric = get_genai_metric('groundedness',
-                                  predictions=PREDICTIONS * 5,
-                                  references=REFERENCES * 5,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1] * 5
+    def test_groundedness(self):
+        self.assert_metrics('groundedness', 1, 5,
+                            predictions=PREDICTIONS,
+                            references=REFERENCES)
 
     def test_relevance(self):
-        metric = get_genai_metric('relevance',
-                                  predictions=PREDICTIONS,
-                                  references=REFERENCES,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1]
-
-        metric = get_genai_metric('relevance',
-                                  predictions=PREDICTIONS * 5,
-                                  references=REFERENCES * 5,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1] * 5
+        self.assert_metrics('relevance', 1, 5,
+                            predictions=PREDICTIONS,
+                            references=REFERENCES)
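
Usage sketch (not part of the diff): a minimal example of how the new get_genai_metric_mean helper could be called, based only on the code above. EchoModelWrapper is a hypothetical stand-in that mirrors the DummyModelWrapper used in the tests; a real caller would pass an actual model wrapper.

import numpy as np

from responsibleai_text.utils.genai_metrics.metrics import (
    get_genai_metric, get_genai_metric_mean)


class EchoModelWrapper:
    """Hypothetical wrapper that scores every prompt as '1'."""

    def predict(self, inp):
        return ['1'] * len(inp)


# Per-sample scores, e.g. {'scores': [1, 1, 1]}
scores = get_genai_metric('coherence',
                          predictions=['This is a prediction'] * 3,
                          references=['This is a reference'] * 3,
                          wrapper_model=EchoModelWrapper())

# Single aggregate value: np.mean over the 'scores' list returned above
mean_score = get_genai_metric_mean('coherence',
                                   predictions=['This is a prediction'] * 3,
                                   references=['This is a reference'] * 3,
                                   wrapper_model=EchoModelWrapper())
assert mean_score == np.mean(scores['scores'])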