diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_dsb_evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_safety_evaluation/__init__.py
similarity index 100%
rename from sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_dsb_evaluation/__init__.py
rename to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_safety_evaluation/__init__.py
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_dsb_evaluation/_dsb_evaluation.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py
similarity index 78%
rename from sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_dsb_evaluation/_dsb_evaluation.py
rename to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py
index 855daf030b25..e5597ca88377 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_dsb_evaluation/_dsb_evaluation.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py
@@ -9,7 +9,7 @@
 from datetime import datetime
 from azure.ai.evaluation._common._experimental import experimental
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-from azure.ai.evaluation._evaluators import _content_safety, _protected_material, _groundedness, _relevance, _similarity, _fluency, _xpia
+from azure.ai.evaluation._evaluators import _content_safety, _protected_material, _groundedness, _relevance, _similarity, _fluency, _xpia, _coherence
 from azure.ai.evaluation._evaluate import _evaluate
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
@@ -41,9 +41,9 @@ def _setup_logger():
     return logger
 
 @experimental
-class _DSBEvaluator(Enum):
+class _SafetyEvaluator(Enum):
     '''
-    Evaluator types for DSB evaluation.
+    Evaluator types for Safety evaluation.
     '''
 
     CONTENT_SAFETY = "content_safety"
@@ -52,11 +52,12 @@ class _DSBEvaluator(Enum):
     RELEVANCE = "relevance"
     SIMILARITY = "similarity"
     FLUENCY = "fluency"
+    COHERENCE = "coherence"
     INDIRECT_ATTACK = "indirect_attack"
     DIRECT_ATTACK = "direct_attack"
 
 @experimental
-class _DSBEvaluation:
+class _SafetyEvaluation:
     def __init__(
         self,
         azure_ai_project: dict,
@@ -64,7 +65,7 @@ def __init__(
         model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
     ):
         '''
-        Initializes a DSBEvaluation object.
+        Initializes a SafetyEvaluation object.
 
         :param azure_ai_project: A dictionary defining the Azure AI project. Required keys are 'subscription_id', 'resource_group_name', and 'project_name'.
         :type azure_ai_project: Dict[str, str]
@@ -116,12 +117,13 @@ def _validate_model_config(model_config: Any):
         if none_keys:
             raise ValueError(f"The following keys in model_config must not be None: {', '.join(none_keys)}")
 
-    async def _simulate_dsb(
+    async def _simulate(
         self,
         target: Callable,
         max_conversation_turns: int = 1,
         max_simulation_results: int = 3,
         conversation_turns : List[List[Union[str, Dict[str, Any]]]] = [],
+        tasks: List[str] = [],
         adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
         source_text: Optional[str] = None,
         direct_attack: bool = False,
@@ -137,6 +139,8 @@
         :type max_simulation_results: int
         :param conversation_turns: Predefined conversation turns to simulate.
         :type conversation_turns: List[List[Union[str, Dict[str, Any]]]]
+        :param tasks: A list of user tasks, each represented as a string. Any source text supplied should be relevant to the tasks and can provide context for the simulation.
+        :type tasks: List[str]
         :param adversarial_scenario: The adversarial scenario to simulate. If None, the non-adversarial Simulator is used.
         :type adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
         :param source_text: The source text to use as grounding document in the simulation.
@@ -188,6 +192,7 @@ async def callback(
                 scenario=adversarial_scenario,
                 max_conversation_turns=max_conversation_turns,
                 max_simulation_results=max_simulation_results,
+                tasks=tasks,
                 conversation_turns=conversation_turns,
                 text=source_text,
                 target=callback,
@@ -286,51 +291,99 @@ async def callback(
             simulator_data_paths["regular"] = data_path
 
         return simulator_data_paths
+
+    def _get_scenario(
+        self,
+        evaluators: List[_SafetyEvaluator],
+        num_turns: int = 3,
+    ) -> Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]:
+        '''
+        Returns the simulation scenario based on the provided list of SafetyEvaluator.
+
+        :param evaluators: A list of SafetyEvaluator.
+        :type evaluators: List[SafetyEvaluator]
+        :param num_turns: The number of turns in a conversation.
+        :type num_turns: int
+        '''
+        for evaluator in evaluators:
+            if evaluator in [_SafetyEvaluator.CONTENT_SAFETY, _SafetyEvaluator.DIRECT_ATTACK]:
+                return (
+                    AdversarialScenario.ADVERSARIAL_CONVERSATION
+                    if num_turns > 1
+                    else AdversarialScenario.ADVERSARIAL_QA
+                )
+            if evaluator in [
+                _SafetyEvaluator.GROUNDEDNESS,
+                _SafetyEvaluator.RELEVANCE,
+                _SafetyEvaluator.SIMILARITY,
+                _SafetyEvaluator.FLUENCY,
+                _SafetyEvaluator.COHERENCE,
+            ]:
+                return None
+            if evaluator == _SafetyEvaluator.PROTECTED_MATERIAL:
+                return AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL
+            if evaluator == _SafetyEvaluator.INDIRECT_ATTACK:
+                return AdversarialScenarioJailbreak.ADVERSARIAL_INDIRECT_JAILBREAK
+
+            msg = f"Invalid evaluator: {evaluator}. Supported evaluators: {_SafetyEvaluator.__members__.values()}"
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.UNKNOWN,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
 
     def _get_evaluators(
         self,
-        evaluators: List[_DSBEvaluator],
+        evaluators: List[_SafetyEvaluator],
     ) -> Dict[str, Callable]:
         '''
-        Returns a dictionary of evaluators based on the provided list of DSBEvaluator.
+        Returns a dictionary of evaluators based on the provided list of SafetyEvaluator.
 
-        :param evaluators: A list of DSBEvaluator.
-        :type evaluators: List[DSBEvaluator]
+        :param evaluators: A list of SafetyEvaluator.
+        :type evaluators: List[SafetyEvaluator]
         '''
         evaluators_dict = {}
         for evaluator in evaluators:
-            if evaluator == _DSBEvaluator.CONTENT_SAFETY:
+            if evaluator == _SafetyEvaluator.CONTENT_SAFETY:
                 evaluators_dict["content_safety"] = _content_safety.ContentSafetyEvaluator(
                     azure_ai_project=self.azure_ai_project, credential=self.credential
                 )
-            elif evaluator == _DSBEvaluator.GROUNDEDNESS:
+            elif evaluator == _SafetyEvaluator.GROUNDEDNESS:
                 evaluators_dict["groundedness"] = _groundedness.GroundednessEvaluator(
                     model_config=self.model_config,
                 )
-            elif evaluator == _DSBEvaluator.PROTECTED_MATERIAL:
+            elif evaluator == _SafetyEvaluator.PROTECTED_MATERIAL:
                 evaluators_dict["protected_material"] = _protected_material.ProtectedMaterialEvaluator(
                     azure_ai_project=self.azure_ai_project, credential=self.credential
                 )
-            elif evaluator == _DSBEvaluator.RELEVANCE:
+            elif evaluator == _SafetyEvaluator.RELEVANCE:
                 evaluators_dict["relevance"] = _relevance.RelevanceEvaluator(
                     model_config=self.model_config,
                 )
-            elif evaluator == _DSBEvaluator.SIMILARITY:
+            elif evaluator == _SafetyEvaluator.SIMILARITY:
                 evaluators_dict["similarity"] = _similarity.SimilarityEvaluator(
                     model_config=self.model_config,
                 )
-            elif evaluator == _DSBEvaluator.FLUENCY:
+            elif evaluator == _SafetyEvaluator.FLUENCY:
                 evaluators_dict["fluency"] = _fluency.FluencyEvaluator(
                     model_config=self.model_config,
                 )
-            elif evaluator == _DSBEvaluator.INDIRECT_ATTACK:
+            elif evaluator == _SafetyEvaluator.COHERENCE:
+                evaluators_dict["coherence"] = _coherence.CoherenceEvaluator(
+                    model_config=self.model_config,
+                )
+            elif evaluator == _SafetyEvaluator.INDIRECT_ATTACK:
                 evaluators_dict["indirect_attack"] = _xpia.IndirectAttackEvaluator(
                     azure_ai_project=self.azure_ai_project, credential=self.credential
                 )
-            elif evaluator == _DSBEvaluator.DIRECT_ATTACK:
-                continue
+            elif evaluator == _SafetyEvaluator.DIRECT_ATTACK:
+                evaluators_dict["content_safety"] = _content_safety.ContentSafetyEvaluator(
+                    azure_ai_project=self.azure_ai_project, credential=self.credential
+                )
             else:
-                msg = f"Invalid evaluator: {evaluator}. Supported evaluators are: {_DSBEvaluator.__members__.values()}"
+                msg = f"Invalid evaluator: {evaluator}. Supported evaluators are: {_SafetyEvaluator.__members__.values()}"
                 raise EvaluationException(
                     message=msg,
                     internal_message=msg,
@@ -358,23 +411,22 @@ def _check_target_returns_context(target: Callable) -> bool:
 
     def _validate_inputs(
         self,
-        evaluators: List[_DSBEvaluator],
+        evaluators: List[_SafetyEvaluator],
         target: Callable,
         source_text: Optional[str] = None,
-        adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
     ):
         '''
-        Validates the inputs provided to the __call__ function of the DSBEvaluation object.
-        :param evaluators: A list of DSBEvaluator.
-        :type evaluators: List[DSBEvaluator]
+        Validates the inputs provided to the __call__ function of the SafetyEvaluation object.
+        :param evaluators: A list of SafetyEvaluator.
+        :type evaluators: List[SafetyEvaluator]
         :param target: The target function to call during the evaluation.
         :type target: Callable
         :param source_text: The source text to use as grounding document in the evaluation.
         :type source_text: Optional[str]
         :param adversarial_scenario: The adversarial scenario to simulate.
         :type adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
-        '''
-        if _DSBEvaluator.GROUNDEDNESS in evaluators and not (self._check_target_returns_context(target) or source_text):
+        '''
+        if _SafetyEvaluator.GROUNDEDNESS in evaluators and not (self._check_target_returns_context(target) or source_text):
             self.logger.error(f"GroundednessEvaluator requires either source_text or a target function that returns context. Source text: {source_text}, _check_target_returns_context: {self._check_target_returns_context(target)}")
             msg = "GroundednessEvaluator requires either source_text or a target function that returns context"
             raise EvaluationException(
@@ -383,68 +435,17 @@ def _validate_inputs(
                 message=msg,
                 internal_message=msg,
                 target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
                 category=ErrorCategory.MISSING_FIELD,
                 blame=ErrorBlame.USER_ERROR,
-            )
-
-        if _DSBEvaluator.INDIRECT_ATTACK in evaluators and adversarial_scenario != AdversarialScenarioJailbreak.ADVERSARIAL_INDIRECT_JAILBREAK:
-            self.logger.error(f"IndirectAttackEvaluator requires adversarial_scenario to be set to ADVERSARIAL_INDIRECT_JAILBREAK. Adversarial scenario: {adversarial_scenario}")
-            msg = "IndirectAttackEvaluator requires adversarial_scenario to be set to ADVERSARIAL_INDIRECT_JAILBREAK"
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.INDIRECT_ATTACK_EVALUATOR,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        if evaluators != [_DSBEvaluator.INDIRECT_ATTACK] and adversarial_scenario == AdversarialScenarioJailbreak.ADVERSARIAL_INDIRECT_JAILBREAK:
-            self.logger.error(f"IndirectAttackEvaluator should be used when adversarial_scenario is set to ADVERSARIAL_INDIRECT_JAILBREAK. Evaluators {evaluators}")
-            msg = "IndirectAttackEvaluator should be used when adversarial_scenario is set to ADVERSARIAL_INDIRECT_JAILBREAK"
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.INDIRECT_ATTACK_EVALUATOR,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-
-        if _DSBEvaluator.PROTECTED_MATERIAL in evaluators and adversarial_scenario != AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL:
-            self.logger.error(f"ProtectedMaterialEvaluator requires adversarial_scenario to be set to ADVERSARIAL_CONTENT_PROTECTED_MATERIAL. Adversarial scenario: {adversarial_scenario}")
-            msg = "ProtectedMaterialEvaluator requires adversarial_scenario to be set to ADVERSARIAL_CONTENT_PROTECTED_MATERIAL"
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        if evaluators != [_DSBEvaluator.PROTECTED_MATERIAL] and adversarial_scenario == AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL:
-            self.logger.error(f"ProtectedMaterialEvaluator should be used when adversarial_scenario is set to ADVERSARIAL_CONTENT_PROTECTED_MATERIAL. Evaluators: {evaluators}")
Evaluators: {evaluators}") - msg = "ProtectedMaterialEvaluator should be used when adversarial_scenario is set to ADVERSARIAL_CONTENT_PROTECTED_MATERIAL" - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.PROTECTED_MATERIAL_EVALUATOR, - category=ErrorCategory.INVALID_VALUE, - blame=ErrorBlame.USER_ERROR, - ) - if _DSBEvaluator.DIRECT_ATTACK in evaluators and len(evaluators) == 1: - self.logger.error("DirectAttack should be used along with other evaluators") - msg = "DirectAttack should be used along with other evaluators" - raise EvaluationException( - message=msg, - internal_message=msg, - target=ErrorTarget.DIRECT_ATTACK_SIMULATOR, - category=ErrorCategory.INVALID_VALUE, - blame=ErrorBlame.USER_ERROR, - ) + ) async def __call__( self, - evaluators: List[_DSBEvaluator], + evaluators: List[_SafetyEvaluator], target: Callable, - adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None, - max_conversation_turns: int = 1, - max_simulation_results: int = 3, + evaluation_name: Optional[str] = None, + num_turns : int=1, + num_rows: int = 3, conversation_turns : List[List[Union[str, Dict[str, Any]]]] = [], + tasks: List[str] = [], source_text: Optional[str] = None, data_path: Optional[Union[str, os.PathLike]] = None, jailbreak_data_path: Optional[Union[str, os.PathLike]] = None, @@ -453,18 +454,20 @@ async def __call__( ''' Evaluates the target function based on the provided parameters. - :param evaluators: A list of DSBEvaluator. - :type evaluators: List[_DSBEvaluator] + :param evaluators: A list of SafetyEvaluator. + :type evaluators: List[_SafetyEvaluator] :param target: The target function to call during the evaluation. :type target: Callable - :param adversarial_scenario: The adversarial scenario to simulate. If None, the non-adversarial Simulator is used. - :type adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] - :param max_conversation_turns: The maximum number of turns in a conversation. - :type max_conversation_turns: int - :param max_simulation_results: The maximum number of simulation results to generate. - :type max_simulation_results: int + :param evaluation_name: The display name name of the evaluation. + :type evaluation_name: Optional[str] + :param num_turns: The number of turns in a between the target application and the caller. + :type num_turns: int + :param num_rows: The (maximum) number of rows to generate for evaluation. + :type num_rows: int :param conversation_turns: Predefined conversation turns to simulate. :type conversation_turns: List[List[Union[str, Dict[str, Any]]]] + :param tasks A list of user tasks, each represented as a list of strings. Text should be relevant for the tasks and facilitate the simulation. One example is to use text to provide context for the tasks. + :type tasks: List[str] = [], :param source_text: The source text to use as grounding document in the evaluation. :type source_text: Optional[str] :param data_path: The path to the data file generated by the Simulator. If None, the Simulator will be run. 
@@ -475,44 +478,47 @@ async def __call__(
         :type output_path: Optional[Union[str, os.PathLike]]
         '''
 
         ## Log inputs
-        self.logger.info(f"User inputs: evaluators{evaluators}, adversarial_scenario={adversarial_scenario}, max_conversation_turns={max_conversation_turns}, max_simulation_results={max_simulation_results}, conversation_turns={conversation_turns}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}")
+        self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}")
 
         ## Validate arguments
         self._validate_inputs(
             evaluators=evaluators,
             target=target,
             source_text=source_text,
-            adversarial_scenario=adversarial_scenario
         )
 
+        # Get scenario
+        adversarial_scenario = self._get_scenario(evaluators)
+
+        ## Get evaluators
+        evaluators_dict = self._get_evaluators(evaluators)
+
         ## If `data_path` is not provided, run simulator
         if data_path is None and jailbreak_data_path is None:
             self.logger.info(f"No data_path provided. Running simulator.")
-            data_paths = await self._simulate_dsb(
+            data_paths = await self._simulate(
                 target=target,
                 adversarial_scenario=adversarial_scenario,
-                max_conversation_turns=max_conversation_turns,
-                max_simulation_results=max_simulation_results,
+                max_conversation_turns=num_turns,
+                max_simulation_results=num_rows,
                 conversation_turns=conversation_turns,
+                tasks=tasks,
                 source_text=source_text,
-                direct_attack=_DSBEvaluator.DIRECT_ATTACK in evaluators
+                direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators
             )
             data_path = data_paths.get("regular", None)
             jailbreak_data_path = data_paths.get("jailbreak", None)
 
-        ## Get evaluators
-        evaluators_dict = self._get_evaluators(evaluators)
-
-        evaluation_results = {}
-        ## Run evaluation
-        if _DSBEvaluator.DIRECT_ATTACK in evaluators and jailbreak_data_path:
+        evaluation_results = {}
+        if _SafetyEvaluator.DIRECT_ATTACK in evaluators and jailbreak_data_path:
             self.logger.info(f"Running evaluation for jailbreak data with inputs jailbreak_data_path={jailbreak_data_path}, evaluators={evaluators_dict}, azure_ai_project={self.azure_ai_project}, output_path=jailbreak_{output_path}, credential={self.credential}")
             evaluate_outputs_jailbreak = _evaluate.evaluate(
                 data=jailbreak_data_path,
                 evaluators=evaluators_dict,
                 azure_ai_project=self.azure_ai_project,
                 output_path=Path("jailbreak_" + str(output_path)),
-                credential=self.credential,
+                evaluation_name=evaluation_name,
             )
             evaluation_results["jailbreak"] = evaluate_outputs_jailbreak
 
@@ -522,9 +528,10 @@ async def __call__(
             data=data_path,
             evaluators=evaluators_dict,
             azure_ai_project=self.azure_ai_project,
+            evaluation_name=evaluation_name,
             output_path=output_path,
         )
-        if _DSBEvaluator.DIRECT_ATTACK in evaluators:
+        if _SafetyEvaluator.DIRECT_ATTACK in evaluators:
             evaluation_results["regular"] = evaluate_outputs
         else:
             return evaluate_outputs
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_safety_evaluation.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_safety_evaluation.py
new file mode 100644
index 000000000000..b004c9c524d3
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_safety_evaluation.py
@@ -0,0 +1,144 @@
+import pytest
+from unittest.mock import AsyncMock, MagicMock, patch
+from azure.ai.evaluation._safety_evaluation._safety_evaluation import _SafetyEvaluation, _SafetyEvaluator
+from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialScenarioJailbreak, AdversarialSimulator
+from azure.ai.evaluation._exceptions import EvaluationException
+from azure.ai.evaluation.simulator._utils import JsonLineChatProtocol, JsonLineList
+from azure.core.credentials import TokenCredential
+
+@pytest.fixture
+def mock_credential():
+    return MagicMock(spec=TokenCredential)
+
+@pytest.fixture
+def mock_model_config_dict_valid():
+    return {
+        "azure_deployment": "test_deployment",
+        "azure_endpoint": "https://example.azure.com/",
+        "type": "azure_openai",
+    }
+
+@pytest.fixture
+def mock_model_config_dict_invalid():
+    return {
+        "type": "azure_openai",
+    }
+
+@pytest.fixture
+def mock_target():
+    def mock_target_fn() -> str:
+        return "mock response"
+    return mock_target_fn
+
+@pytest.fixture
+def mock_target_with_context():
+    def mock_target_with_context_fn() -> tuple:
+        return ("mock response", "mock context")
+    return mock_target_with_context_fn
+
+@pytest.fixture
+def safety_eval(mock_model_config_dict_valid, mock_credential):
+    return _SafetyEvaluation(
+        azure_ai_project={"subscription_id": "mock-sub", "resource_group_name": "mock-rg", "project_name": "mock-proj"},
+        credential=mock_credential,
+        model_config=mock_model_config_dict_valid,
+    )
+
+@pytest.mark.usefixtures("mock_model_config")
+@pytest.mark.unittest
+class TestSafetyEvaluation:
+    def test_validate_model_config_missing_keys(self, mock_credential, mock_model_config_dict_invalid):
+        with pytest.raises(ValueError) as exc_info:
+            _SafetyEvaluation(
+                azure_ai_project={"subscription_id": "sub", "resource_group_name": "rg", "project_name": "proj"},
+                credential=mock_credential,
+                model_config=mock_model_config_dict_invalid,
+            )
+        assert "missing required keys" in str(exc_info.value)
+
+    def test_get_evaluators_invalid(self, safety_eval):
+        with pytest.raises(EvaluationException) as exc_info:
+            safety_eval._get_evaluators([None])  # type: ignore
+        assert "Invalid evaluator:" in str(exc_info.value)
+
+    def test_get_scenario_invalid(self, safety_eval):
+        with pytest.raises(EvaluationException) as exc_info:
+            safety_eval._get_scenario([None])  # type: ignore
+        assert "Invalid evaluator:" in str(exc_info.value)
+
+    def test_check_target_returns_context_false(self, safety_eval, mock_target):
+        assert not safety_eval._check_target_returns_context(mock_target)
+
+    def test_check_target_returns_context_true(self, safety_eval, mock_target_with_context):
+        assert safety_eval._check_target_returns_context(mock_target_with_context)
+
+    def test_validate_inputs_groundedness_no_source(self, safety_eval, mock_target):
+        with pytest.raises(EvaluationException) as exc_info:
+            safety_eval._validate_inputs(
+                evaluators=[_SafetyEvaluator.GROUNDEDNESS],
+                target=mock_target,
+                source_text=None,
+            )
+        assert "requires either source_text" in str(exc_info.value)
+
+    @pytest.mark.asyncio
+    @patch("azure.ai.evaluation.simulator._simulator.Simulator.__call__", new_callable=AsyncMock)
+    async def test_simulate_no_scenario(self, mock__call__, safety_eval, mock_target):
+        mock__call__.return_value = [JsonLineChatProtocol({"messages": []})]
+        results = await safety_eval._simulate(target=mock_target)
+        assert isinstance(results, dict)
+        assert isinstance(results["regular"], str)
+
+    @pytest.mark.asyncio
+    @patch("azure.ai.evaluation.simulator.DirectAttackSimulator.__init__", return_value=None)
@patch("azure.ai.evaluation.simulator.DirectAttackSimulator.__call__", new_callable=AsyncMock) + async def test_simulate_direct_attack(self, mock_call, mock_init, safety_eval, mock_target): + mock_call.return_value = {"jailbreak":JsonLineList([{"messages": []}]),"regular":JsonLineList([{"messages": []}])} + + results = await safety_eval._simulate( + target=mock_target, + direct_attack=True, + adversarial_scenario=AdversarialScenario.ADVERSARIAL_QA + ) + assert isinstance(results, dict) + assert isinstance(results["regular"], str) + assert isinstance(results["jailbreak"], str) + + + @pytest.mark.asyncio + @patch("azure.ai.evaluation.simulator.IndirectAttackSimulator.__init__", return_value=None) + @patch("azure.ai.evaluation.simulator.IndirectAttackSimulator.__call__", new_callable=AsyncMock) + async def test_simulate_indirect_jailbreak(self, mock_call, mock_init, safety_eval, mock_target): + mock_call.return_value = JsonLineList([{"messages":[]}]) + + results = await safety_eval._simulate( + target=mock_target, + adversarial_scenario=AdversarialScenarioJailbreak.ADVERSARIAL_INDIRECT_JAILBREAK + ) + assert isinstance(results, dict) + assert isinstance(results["regular"], str) + + @pytest.mark.asyncio + @patch("azure.ai.evaluation.simulator.AdversarialSimulator.__init__", return_value=None) + @patch("azure.ai.evaluation.simulator.AdversarialSimulator.__call__", new_callable=AsyncMock) + async def test_simulate_adversarial(self, mock_call, mock_init, safety_eval, mock_target): + mock_call.return_value = JsonLineList([{"messages":[]}]) + results = await safety_eval._simulate( + target=mock_target, + adversarial_scenario=AdversarialScenario.ADVERSARIAL_QA + ) + assert isinstance(results, dict) + assert isinstance(results["regular"], str) + + @pytest.mark.asyncio + @patch("azure.ai.evaluation.simulator.AdversarialSimulator.__init__", return_value=None) + @patch("azure.ai.evaluation.simulator.AdversarialSimulator.__call__", new_callable=AsyncMock) + async def test_simulate_no_results(self, mock_call, mock_init, safety_eval, mock_target): + + mock_call.return_value = None + with pytest.raises(EvaluationException) as exc_info: + results = await safety_eval._simulate( + target=mock_target, + adversarial_scenario=AdversarialScenario.ADVERSARIAL_QA + ) + assert "outputs generated by the simulator" in str(exc_info.value) \ No newline at end of file