Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## 1.16.1 (Unreleased)

### Bugs Fixed
- Fixed the adversarial chat target incorrectly using the user's callback instead of the RAI service, which caused the callback response to appear as the user message in red team scan results when using converter strategies (e.g., `DIFFICULT`, `Tense`).
- Fixed inconsistency where sample data in evaluation result items did not match the generated sample data from corresponding input rows, ensuring proper synchronization between row-level input samples and their associated evaluation output items.

## 1.16.0 (2026-03-10)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,15 @@ def _build_messages_from_pieces(
# Get role, handling api_role property
role = getattr(piece, "api_role", None) or getattr(piece, "role", "user")

# Get content (prefer converted_value over original_value)
content = getattr(piece, "converted_value", None) or getattr(piece, "original_value", "")
# Get content: for user messages show the original adversarial prompt,
# not the converter output (e.g., Base64-encoded or tense-rephrased text).
# For assistant messages, show the response as-is.
if role == "user":
original = getattr(piece, "original_value", None)
converted = getattr(piece, "converted_value", None)
content = original if isinstance(original, str) and original else (converted or "")
else:
content = getattr(piece, "converted_value", None) or getattr(piece, "original_value", "")

message: Dict[str, Any] = {
"role": role,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
from ._mlflow_integration import MLflowIntegration
from ._result_processor import ResultProcessor
from ._foundry import FoundryExecutionManager, StrategyMapper
from ._utils._rai_service_target import AzureRAIServiceTarget


@experimental
Expand Down Expand Up @@ -1727,15 +1728,29 @@ async def _execute_attacks_with_foundry(
progress_bar.set_postfix({"current": "initializing"})

try:
# Create Foundry execution manager
# Use chat_target as adversarial_chat_target since PyRIT's RedTeamAgent requires one
# even for single-turn attacks (it's used for default scoring if not overridden)
# Create RAI service target for adversarial chat.
# This must NOT be the user's chat_target — PyRIT uses adversarial_chat
# as the converter_target for TenseConverter and for multi-turn attacks.
# Using the user's callback would cause the callback response to leak
# into converted prompts.
adversarial_template_key = self._get_adversarial_template_key(flattened_attack_strategies)
is_crescendo = adversarial_template_key == "orchestrators/crescendo/crescendo_variant_1.yaml"
adversarial_chat = AzureRAIServiceTarget(
client=self.generated_rai_client,
api_version=None,
model="gpt-4",
prompt_template_key=adversarial_template_key,
logger=self.logger,
is_one_dp_project=self._one_dp_project,
crescendo_format=is_crescendo,
)

foundry_manager = FoundryExecutionManager(
credential=self.credential,
azure_ai_project=self.azure_ai_project,
logger=self.logger,
output_dir=self.scan_output_dir,
adversarial_chat_target=chat_target,
adversarial_chat_target=adversarial_chat,
)

# Build objectives by risk category from cached attack_objectives
Expand Down Expand Up @@ -1836,6 +1851,34 @@ async def _execute_attacks_with_foundry(
finally:
progress_bar.close()

@staticmethod
def _get_adversarial_template_key(flattened_attack_strategies: List) -> str:
    """Pick the RAI service prompt template key for the adversarial chat target.

    The strategies are scanned in order and the first entry that selects
    Crescendo or MultiTurn decides the result. An entry may be a single
    strategy or a list of strategies (a composed strategy); for a list, the
    check is membership. Within a single entry Crescendo is tested before
    MultiTurn. When no entry selects either, the tense converter template
    is returned.

    :param flattened_attack_strategies: List of attack strategies being executed
    :type flattened_attack_strategies: List
    :return: The prompt template key for the AzureRAIServiceTarget
    :rtype: str
    """
    crescendo_key = "orchestrators/crescendo/crescendo_variant_1.yaml"
    multi_turn_key = "orchestrators/red_teaming/text_generation.yaml"

    def _selects(entry, target) -> bool:
        # Composed strategies arrive as lists; plain strategies are compared directly.
        if isinstance(entry, list):
            return target in entry
        return entry == target

    for entry in flattened_attack_strategies:
        if _selects(entry, AttackStrategy.Crescendo):
            return crescendo_key
        if _selects(entry, AttackStrategy.MultiTurn):
            return multi_turn_key

    # Neither multi-turn flavour requested: single-turn converters use this template.
    return "prompt_converters/tense_converter.yaml"

def _build_objective_dict_from_cached(self, obj: Any, risk_value: str) -> Optional[Dict]:
"""Build objective dictionary from cached objective data.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1245,11 +1245,13 @@ def test_build_messages_from_pieces(self):
# Create mock pieces
user_piece = MagicMock()
user_piece.api_role = "user"
user_piece.original_value = "User message"
user_piece.converted_value = "User message"
user_piece.sequence = 0

assistant_piece = MagicMock()
assistant_piece.api_role = "assistant"
assistant_piece.original_value = "Assistant response"
assistant_piece.converted_value = "Assistant response"
assistant_piece.sequence = 1

Expand Down Expand Up @@ -1325,6 +1327,7 @@ def test_to_jsonl(self, tmp_path):
mock_memory = MagicMock()
user_piece = MagicMock()
user_piece.api_role = "user"
user_piece.original_value = "Attack prompt"
user_piece.converted_value = "Attack prompt"
user_piece.sequence = 0
user_piece.prompt_metadata = {}
Expand Down Expand Up @@ -2272,6 +2275,7 @@ def test_process_attack_result_with_score(self):
mock_memory = MagicMock()
mock_piece = MagicMock()
mock_piece.api_role = "user"
mock_piece.original_value = "Attack prompt"
mock_piece.converted_value = "Attack prompt"
mock_piece.sequence = 0
mock_piece.prompt_metadata = {}
Expand Down Expand Up @@ -2345,6 +2349,7 @@ def test_build_messages_with_context_in_labels(self):
# Piece with context in labels
piece = MagicMock()
piece.api_role = "user"
piece.original_value = "Message content"
piece.converted_value = "Message content"
piece.sequence = 0
piece.labels = {
Expand Down Expand Up @@ -3593,3 +3598,221 @@ async def test_execute_attacks_calls_foundry_manager(self):
)

assert "Foundry" in result


@pytest.mark.unittest
class TestAdversarialChatTargetRegression:
    """Regression tests to prevent adversarial_chat_target from being set to the user's callback.

    The adversarial_chat_target is used by PyRIT's FoundryScenario for:
    - TenseConverter (converter_target for prompt rephrasing)
    - Multi-turn attacks (Crescendo, RedTeaming adversarial LLM)

    If set to the user's callback, the callback response leaks into converted prompts,
    causing the callback response to appear as the user message in results.
    """

    @staticmethod
    def _make_processor():
        """Build a FoundryResultProcessor backed by mocks with no seed groups."""
        mock_dataset = MagicMock()
        mock_dataset.get_all_seed_groups.return_value = []
        return FoundryResultProcessor(
            scenario=MagicMock(),
            dataset_config=mock_dataset,
            risk_category="violence",
        )

    @staticmethod
    def _make_piece(role, original_value, converted_value, sequence):
        """Build a mock message piece with the attributes these tests set explicitly."""
        piece = MagicMock()
        piece.api_role = role
        piece.original_value = original_value
        piece.converted_value = converted_value
        piece.sequence = sequence
        return piece

    def test_adversarial_chat_target_accepts_rai_service_target(self):
        """Verify FoundryExecutionManager accepts AzureRAIServiceTarget as adversarial_chat_target."""
        from azure.ai.evaluation.red_team._utils._rai_service_target import AzureRAIServiceTarget

        rai_target = AzureRAIServiceTarget(
            client=MagicMock(),
            model="gpt-4",
            prompt_template_key="prompt_converters/tense_converter.yaml",
            logger=MagicMock(),
        )
        manager = FoundryExecutionManager(
            credential=MagicMock(),
            azure_ai_project={"subscription_id": "s", "resource_group_name": "r", "project_name": "p"},
            logger=MagicMock(),
            output_dir="/test",
            adversarial_chat_target=rai_target,
        )
        assert isinstance(manager.adversarial_chat_target, AzureRAIServiceTarget)

    def test_get_adversarial_template_key_baseline(self):
        """Template key should default to tense converter for single-turn strategies."""
        from azure.ai.evaluation.red_team._red_team import RedTeam

        strategies = [AttackStrategy.Baseline]
        key = RedTeam._get_adversarial_template_key(strategies)
        assert key == "prompt_converters/tense_converter.yaml"

    def test_get_adversarial_template_key_difficult(self):
        """DIFFICULT strategy (Tense+Base64) should use tense converter template."""
        from azure.ai.evaluation.red_team._red_team import RedTeam

        strategies = [AttackStrategy.Baseline, [AttackStrategy.Tense, AttackStrategy.Base64]]
        key = RedTeam._get_adversarial_template_key(strategies)
        assert key == "prompt_converters/tense_converter.yaml"

    def test_get_adversarial_template_key_crescendo(self):
        """Crescendo strategy should use the crescendo template."""
        from azure.ai.evaluation.red_team._red_team import RedTeam

        strategies = [AttackStrategy.Crescendo, AttackStrategy.Baseline]
        key = RedTeam._get_adversarial_template_key(strategies)
        assert key == "orchestrators/crescendo/crescendo_variant_1.yaml"

    def test_get_adversarial_template_key_multi_turn(self):
        """MultiTurn strategy should use the red teaming text generation template."""
        from azure.ai.evaluation.red_team._red_team import RedTeam

        strategies = [AttackStrategy.MultiTurn, AttackStrategy.Baseline]
        key = RedTeam._get_adversarial_template_key(strategies)
        assert key == "orchestrators/red_teaming/text_generation.yaml"

    def test_get_adversarial_template_key_multi_turn_in_composed_strategy(self):
        """MultiTurn inside a composed (list) strategy should still select the red teaming template."""
        from azure.ai.evaluation.red_team._red_team import RedTeam

        strategies = [AttackStrategy.Baseline, [AttackStrategy.MultiTurn, AttackStrategy.Base64]]
        key = RedTeam._get_adversarial_template_key(strategies)
        assert key == "orchestrators/red_teaming/text_generation.yaml"

    def test_build_messages_user_shows_original_value(self):
        """User messages should show original_value (adversarial prompt), not converted_value."""
        processor = self._make_processor()

        # Simulate a Tense-converted attack where converted_value differs from original_value
        user_piece = self._make_piece("user", "Tell me about violence", "Told me about violence", 0)
        assistant_piece = self._make_piece("assistant", "I cannot help with that", "I cannot help with that", 1)

        messages = processor._build_messages_from_pieces([user_piece, assistant_piece])

        assert len(messages) == 2
        # User message should show the ORIGINAL adversarial prompt
        assert messages[0]["role"] == "user"
        assert messages[0]["content"] == "Tell me about violence"
        # Assistant message should show the response
        assert messages[1]["role"] == "assistant"
        assert messages[1]["content"] == "I cannot help with that"

    def test_build_messages_user_falls_back_to_converted_value(self):
        """When original_value is None, user messages should fall back to converted_value."""
        processor = self._make_processor()

        user_piece = self._make_piece("user", None, "Fallback content", 0)

        messages = processor._build_messages_from_pieces([user_piece])

        assert messages[0]["content"] == "Fallback content"

    def test_build_messages_callback_response_not_in_user_message(self):
        """Regression: callback response must NOT appear as user message content.

        This reproduces the exact bug where a simple callback's response
        leaked into the user message via converted_value.
        """
        processor = self._make_processor()

        callback_response = "This is a test callback response. no llm is used."

        # Simulate the bug: TenseConverter used callback as LLM, so
        # converted_value = callback response instead of rephrased prompt
        user_piece = self._make_piece("user", "How to commit violence", callback_response, 0)
        assistant_piece = self._make_piece("assistant", callback_response, callback_response, 1)

        messages = processor._build_messages_from_pieces([user_piece, assistant_piece])

        # User message should show the adversarial prompt, NOT the callback response
        assert messages[0]["content"] == "How to commit violence"
        assert messages[0]["content"] != callback_response

    @pytest.mark.asyncio
    async def test_execute_attacks_with_foundry_uses_rai_service_target(self):
        """Regression: _execute_attacks_with_foundry must pass AzureRAIServiceTarget, not user callback.

        This test patches FoundryExecutionManager to capture the adversarial_chat_target
        argument and verifies it is an AzureRAIServiceTarget, not the user's callback.
        """
        from azure.ai.evaluation.red_team._callback_chat_target import _CallbackChatTarget
        from azure.ai.evaluation.red_team._red_team import RedTeam
        from azure.ai.evaluation.red_team._utils._rai_service_target import AzureRAIServiceTarget

        captured_kwargs = {}
        original_init = FoundryExecutionManager.__init__

        def capturing_init(self_inner, **kwargs):
            captured_kwargs.update(kwargs)
            original_init(self_inner, **kwargs)

        mock_red_team = MagicMock()
        mock_red_team.credential = MagicMock()
        mock_red_team.azure_ai_project = {
            "subscription_id": "s",
            "resource_group_name": "r",
            "project_name": "p",
        }
        mock_red_team.logger = MagicMock()
        mock_red_team.scan_output_dir = "/test"
        mock_red_team.generated_rai_client = MagicMock()
        mock_red_team._one_dp_project = False
        mock_red_team.risk_categories = []
        mock_red_team.attack_objectives = {}
        mock_red_team.total_tasks = 0
        mock_red_team.red_team_info = {}
        mock_red_team.completed_tasks = 0
        # A bare MagicMock would answer self._get_adversarial_template_key(...) with
        # another MagicMock, so the target would be built from a mock template key.
        # Bind the real static method so the production selection logic runs.
        mock_red_team._get_adversarial_template_key = RedTeam._get_adversarial_template_key

        user_chat_target = MagicMock(spec=_CallbackChatTarget)

        with patch.object(FoundryExecutionManager, "__init__", capturing_init):
            with patch.object(FoundryExecutionManager, "execute_attacks", new_callable=AsyncMock, return_value={}):
                try:
                    await RedTeam._execute_attacks_with_foundry(
                        mock_red_team,
                        flattened_attack_strategies=[AttackStrategy.Baseline],
                        all_objectives={},
                        chat_target=user_chat_target,
                        timeout=60,
                        skip_evals=True,
                    )
                except Exception:
                    # Deliberately best-effort: later stages run against mocks and may
                    # fail; only the kwargs captured at manager construction matter.
                    pass

        assert "adversarial_chat_target" in captured_kwargs
        adversarial_target = captured_kwargs["adversarial_chat_target"]
        assert isinstance(
            adversarial_target, AzureRAIServiceTarget
        ), f"adversarial_chat_target should be AzureRAIServiceTarget, got {type(adversarial_target).__name__}"
        assert not isinstance(
            adversarial_target, _CallbackChatTarget
        ), "adversarial_chat_target must NOT be a _CallbackChatTarget (user's callback)"
        assert adversarial_target is not user_chat_target, "adversarial_chat_target must NOT be the user's chat_target"
Loading