From e38414188eacbe1411ee3ba050a1c781cd45fd48 Mon Sep 17 00:00:00 2001 From: Richard Lundeen Date: Thu, 11 Jun 2026 18:54:38 -0700 Subject: [PATCH 1/4] Relocate targeted_harm_categories to AttackResult Phase 14 of the models refactor: restore targeted_harm_categories as a first-class field on AttackResult (regression from #1951, which deleted it outright instead of relocating it). - Add targeted_harm_categories: list[str] to AttackResult - Add nullable JSON column to AttackResultEntry with read/write round-trip - New Alembic migration c3d5e7f9a1b2 (no backfill; source column already dropped and 0.15.0 is unreleased dev) - Auto-populate from SeedGroup.harm_categories via AttackParameters - Stamp centrally in AttackStrategy success/error paths - Re-add as a first-class (non-deprecated) get_attack_results filter - Tests for round-trip, params capture, central stamping, and filtering Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../executor/attack/core/attack_parameters.py | 7 ++ pyrit/executor/attack/core/attack_strategy.py | 26 +++++++ ...geted_harm_categories_to_attack_results.py | 36 +++++++++ pyrit/memory/memory_interface.py | 20 +++++ pyrit/memory/memory_models.py | 4 + pyrit/models/results/attack_result.py | 5 ++ .../attack/core/test_attack_parameters.py | 18 +++++ .../attack/core/test_attack_strategy.py | 74 +++++++++++++++++++ .../test_interface_attack_results.py | 58 +++++++++++++++ 9 files changed, 248 insertions(+) create mode 100644 pyrit/memory/alembic/versions/c3d5e7f9a1b2_add_targeted_harm_categories_to_attack_results.py diff --git a/pyrit/executor/attack/core/attack_parameters.py b/pyrit/executor/attack/core/attack_parameters.py index f4ef53100a..9760b4eed4 100644 --- a/pyrit/executor/attack/core/attack_parameters.py +++ b/pyrit/executor/attack/core/attack_parameters.py @@ -41,6 +41,10 @@ class AttackParameters: # Additional labels that can be applied to the prompts throughout the attack memory_labels: dict[str, str] | None = field(default_factory=dict) + # Harm categories targeted by this attack, derived from the seed group's + # seeds. Stamped onto the produced AttackResult. + targeted_harm_categories: list[str] = field(default_factory=list) + def __str__(self) -> str: """Return a nicely formatted string representation of the attack parameters.""" lines = [f"{self.__class__.__name__}:"] @@ -138,6 +142,9 @@ async def from_seed_group_async( if "memory_labels" in valid_fields: params["memory_labels"] = {} + if "targeted_harm_categories" in valid_fields: + params["targeted_harm_categories"] = list(seed_group.harm_categories) + # Determine which group to use for extracting prepended_conversation/next_message extraction_group: SeedGroup = seed_group diff --git a/pyrit/executor/attack/core/attack_strategy.py b/pyrit/executor/attack/core/attack_strategy.py index e9363e5338..00ccb57644 100644 --- a/pyrit/executor/attack/core/attack_strategy.py +++ b/pyrit/executor/attack/core/attack_strategy.py @@ -239,6 +239,7 @@ async def _on_post_execute_async( # AttackResultEntry row records its lineage. Outside an orchestrator # _attribution is None and both attribution fields stay None. self._apply_attribution(context=event_data.context, result=event_data.result) + self._apply_targeted_harm_categories(context=event_data.context, result=event_data.result) self._logger.debug(f"Attack execution completed in {execution_time_ms}ms") @@ -275,6 +276,30 @@ def _apply_attribution( attribution_data["parent_eval_hash"] = attribution.parent_eval_hash result.attribution_data = attribution_data + @staticmethod + def _apply_targeted_harm_categories( + *, + context: AttackStrategyContextT, + result: AttackResult, + ) -> None: + """ + Copy the attack's targeted harm categories from its parameters onto the result. + + Reads ``context.params.targeted_harm_categories`` (populated in + ``AttackParameters.from_seed_group_async`` from the SeedGroup's + deduplicated harm categories) and stamps it onto the result so it + round-trips into ``AttackResultEntry``. The read is defensive because + some ``AttackParameters`` subclasses may exclude the field. + + Args: + context: The per-task AttackContext. + result: The AttackResult that is about to be persisted. + """ + params = getattr(context, "params", None) + harm_categories = getattr(params, "targeted_harm_categories", None) + if harm_categories: + result.targeted_harm_categories = list(harm_categories) + def _log_attack_outcome(self, result: AttackResult) -> None: """ Log the outcome of the attack. @@ -342,6 +367,7 @@ async def _on_error_async( # Stamp attribution onto the error result so it is locatable via the # attribution_parent_id foreign key on resume. self._apply_attribution(context=context, result=error_result) + self._apply_targeted_harm_categories(context=context, result=error_result) self._memory.add_attack_results_to_memory(attack_results=[error_result]) diff --git a/pyrit/memory/alembic/versions/c3d5e7f9a1b2_add_targeted_harm_categories_to_attack_results.py b/pyrit/memory/alembic/versions/c3d5e7f9a1b2_add_targeted_harm_categories_to_attack_results.py new file mode 100644 index 0000000000..4aecfc13f2 --- /dev/null +++ b/pyrit/memory/alembic/versions/c3d5e7f9a1b2_add_targeted_harm_categories_to_attack_results.py @@ -0,0 +1,36 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +""" +Add targeted_harm_categories to Attack Results. + +Relocates the ``targeted_harm_categories`` field from the per-piece +``PromptMemoryEntries`` (dropped in ``f1a2b3c4d5e6``) to the attack-scoped +``AttackResultEntries`` table, where it belongs. No backfill: the source +per-piece column is already gone and was never populated by production code. + +Revision ID: c3d5e7f9a1b2 +Revises: b2f4c6a8d1e3 +Create Date: 2026-06-11 17:55:00.000000 +""" + +from collections.abc import Sequence + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "c3d5e7f9a1b2" +down_revision: str | None = "b2f4c6a8d1e3" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Apply this schema upgrade.""" + op.add_column("AttackResultEntries", sa.Column("targeted_harm_categories", sa.JSON(), nullable=True)) + + +def downgrade() -> None: + """Revert this schema upgrade.""" + op.drop_column("AttackResultEntries", "targeted_harm_categories") diff --git a/pyrit/memory/memory_interface.py b/pyrit/memory/memory_interface.py index ca4529144b..3f9abeae82 100644 --- a/pyrit/memory/memory_interface.py +++ b/pyrit/memory/memory_interface.py @@ -1890,6 +1890,7 @@ def get_attack_results( converter_classes_match: Literal["all", "any"] = "all", has_converters: bool | None = None, labels: dict[str, str | Sequence[str]] | None = None, + targeted_harm_categories: Sequence[str] | None = None, identifier_filters: Sequence[IdentifierFilter] | None = None, scenario_result_id: str | None = None, ) -> Sequence[AttackResult]: @@ -1933,6 +1934,12 @@ def get_attack_results( ["roakey_op_a", "roakey_op_b"]}`` matches attacks where ``operator == "roakey"`` AND (``operation == "roakey_op_a"`` OR ``operation == "roakey_op_b"``). Defaults to None. + targeted_harm_categories (Sequence[str] | None, optional): Filter results by the + harm categories targeted by the attack (stored on + ``AttackResultEntry.targeted_harm_categories``, auto-populated from the + attack's SeedGroup). Returns attacks targeting ANY of the listed categories + (OR logic, case-insensitive). An empty sequence applies no filter. Defaults + to None. identifier_filters (Sequence[IdentifierFilter] | None, optional): A sequence of IdentifierFilter objects that allows filtering by various attack identifier JSON properties. Defaults to None. @@ -2049,6 +2056,19 @@ def get_attack_results( # Use database-specific JSON query method conditions.append(self._get_attack_result_label_condition(labels=effective_labels)) + if targeted_harm_categories: + # Attack-scoped harm-category filter (replaces the removed per-piece filter). + # Matches attacks whose AttackResultEntry.targeted_harm_categories array contains + # ANY of the requested categories. + conditions.append( + self._get_condition_json_array_match( + json_column=AttackResultEntry.targeted_harm_categories, + property_path="$", + array_to_match=list(targeted_harm_categories), + match_mode="any", + ) + ) + if identifier_filters: conditions.extend( self._build_identifier_filter_conditions( diff --git a/pyrit/memory/memory_models.py b/pyrit/memory/memory_models.py index ab467c31a7..bed6dc03a3 100644 --- a/pyrit/memory/memory_models.py +++ b/pyrit/memory/memory_models.py @@ -852,6 +852,7 @@ class AttackResultEntry(Base): outcome_reason (str): Optional reason for the outcome, providing additional context. attack_metadata (dict[str, Any]): Metadata can be included as key-value pairs to provide extra context. labels (dict[str, str]): Optional labels associated with the attack result entry. + targeted_harm_categories (list[str]): Harm categories this attack targeted. pruned_conversation_ids (list[str]): List of conversation IDs that were pruned from the attack. adversarial_chat_conversation_ids (list[str]): List of conversation IDs used for adversarial chat. timestamp (DateTime): The timestamp of the attack result entry. @@ -883,6 +884,7 @@ class AttackResultEntry(Base): outcome_reason = mapped_column(String, nullable=True) attack_metadata: Mapped[dict[str, str | int | float | bool] | None] = mapped_column(JSON, nullable=True) labels: Mapped[dict[str, str] | None] = mapped_column(JSON, nullable=True) + targeted_harm_categories: Mapped[list[str] | None] = mapped_column(JSON, nullable=True) pruned_conversation_ids: Mapped[list[str] | None] = mapped_column(JSON, nullable=True) adversarial_chat_conversation_ids: Mapped[list[str] | None] = mapped_column(JSON, nullable=True) timestamp = mapped_column(UTCDateTime, nullable=False) @@ -949,6 +951,7 @@ def __init__(self, *, entry: AttackResult) -> None: self.outcome_reason = entry.outcome_reason self.attack_metadata = self.filter_json_serializable_metadata(entry.metadata) self.labels = entry.labels or {} + self.targeted_harm_categories = entry.targeted_harm_categories or None # Persist conversation references by type self.pruned_conversation_ids = [ @@ -1076,6 +1079,7 @@ def get_attack_result(self) -> AttackResult: metadata=self.attack_metadata or {}, timestamp=self.timestamp or datetime.now(tz=timezone.utc), labels=self.labels or {}, + targeted_harm_categories=self.targeted_harm_categories or [], error_message=self.error_message, error_type=self.error_type, error_traceback=self.error_traceback, diff --git a/pyrit/models/results/attack_result.py b/pyrit/models/results/attack_result.py index 648c837214..8367674c3f 100644 --- a/pyrit/models/results/attack_result.py +++ b/pyrit/models/results/attack_result.py @@ -94,6 +94,11 @@ class AttackResult(StrategyResult): # labels associated with this attack result labels: dict[str, str] = Field(default_factory=dict) + # Harm categories this attack targeted. Auto-populated from the attack's + # SeedGroup (the deduplicated union of its seeds' harm_categories) when the + # result is produced by an attack strategy. + targeted_harm_categories: list[str] = Field(default_factory=list) + # Error information (populated when attack fails with exception) error_message: str | None = None error_type: str | None = None diff --git a/tests/unit/executor/attack/core/test_attack_parameters.py b/tests/unit/executor/attack/core/test_attack_parameters.py index a0d7ab60ca..da73761688 100644 --- a/tests/unit/executor/attack/core/test_attack_parameters.py +++ b/tests/unit/executor/attack/core/test_attack_parameters.py @@ -44,6 +44,24 @@ async def test_extracts_objective_from_seed_group(self, seed_group_with_objectiv assert params.objective == "Test objective" + async def test_extracts_targeted_harm_categories_from_seed_group(self) -> None: + """Harm categories from the seed group's seeds are captured onto the parameters.""" + objective = SeedObjective(value="Test objective", harm_categories=["violence"]) + prompt = SeedPrompt(value="Test prompt", data_type="text", role="user", harm_categories=["hate", "violence"]) + seed_group = SeedAttackGroup(seeds=[objective, prompt]) + + params = await AttackParameters.from_seed_group_async(seed_group=seed_group) + + assert sorted(params.targeted_harm_categories) == ["hate", "violence"] + + async def test_targeted_harm_categories_empty_when_seed_group_has_none( + self, seed_group_with_objective: SeedAttackGroup + ) -> None: + """When no seed declares harm categories, the parameters list is empty.""" + params = await AttackParameters.from_seed_group_async(seed_group=seed_group_with_objective) + + assert params.targeted_harm_categories == [] + async def test_raises_when_no_objective(self) -> None: """Test that ValueError is raised when SeedAttackGroup has no objective.""" # SeedAttackGroup now validates exactly one objective at construction diff --git a/tests/unit/executor/attack/core/test_attack_strategy.py b/tests/unit/executor/attack/core/test_attack_strategy.py index 7940ccf668..a0eb00b63e 100644 --- a/tests/unit/executor/attack/core/test_attack_strategy.py +++ b/tests/unit/executor/attack/core/test_attack_strategy.py @@ -702,6 +702,80 @@ async def test_on_error_stamps_scenario_attribution_when_present(self, sample_at "parent_collection": "atomic_err", } + async def test_on_post_execute_stamps_targeted_harm_categories(self, sample_attack_result, mock_memory): + """Harm categories from context.params are stamped onto the persisted result.""" + + class TestAttackContext(AttackContext): + pass + + params = AttackParameters( + objective="Test harmful objective", + targeted_harm_categories=["violence", "hate"], + ) + context = TestAttackContext(params=params) + context.start_time = 100.0 + + with patch("pyrit.memory.central_memory.CentralMemory.get_memory_instance", return_value=mock_memory): + handler = _DefaultAttackStrategyEventHandler() + event_data = StrategyEventData( + event=StrategyEvent.ON_POST_EXECUTE, + strategy_name="TestStrategy", + strategy_id="test-id", + context=context, + result=sample_attack_result, + ) + await handler.on_event_async(event_data) + + assert sorted(sample_attack_result.targeted_harm_categories) == ["hate", "violence"] + + async def test_on_post_execute_no_harm_categories_leaves_empty( + self, sample_attack_context, sample_attack_result, mock_memory + ): + """With no harm categories on params, the result's list stays empty.""" + with patch("pyrit.memory.central_memory.CentralMemory.get_memory_instance", return_value=mock_memory): + handler = _DefaultAttackStrategyEventHandler() + sample_attack_context.start_time = 100.0 + + event_data = StrategyEventData( + event=StrategyEvent.ON_POST_EXECUTE, + strategy_name="TestStrategy", + strategy_id="test-id", + context=sample_attack_context, + result=sample_attack_result, + ) + await handler.on_event_async(event_data) + + assert sample_attack_result.targeted_harm_categories == [] + + async def test_on_error_stamps_targeted_harm_categories(self, mock_memory): + """Error AttackResults must also carry the targeted harm categories.""" + + class TestAttackContext(AttackContext): + pass + + params = AttackParameters( + objective="Test harmful objective", + targeted_harm_categories=["self_harm"], + ) + context = TestAttackContext(params=params) + context.start_time = 100.0 + + with patch("pyrit.memory.central_memory.CentralMemory.get_memory_instance", return_value=mock_memory): + handler = _DefaultAttackStrategyEventHandler() + event_data = StrategyEventData( + event=StrategyEvent.ON_ERROR, + strategy_name="TestStrategy", + strategy_id="test-id", + context=context, + error=RuntimeError("boom"), + ) + await handler.on_event_async(event_data) + + call = mock_memory.add_attack_results_to_memory.call_args + persisted = call.kwargs["attack_results"][0] + assert persisted.outcome == AttackOutcome.ERROR + assert persisted.targeted_harm_categories == ["self_harm"] + @pytest.mark.usefixtures("patch_central_database") class TestAttackStrategyIntegration: diff --git a/tests/unit/memory/memory_interface/test_interface_attack_results.py b/tests/unit/memory/memory_interface/test_interface_attack_results.py index 414eef135b..debe035316 100644 --- a/tests/unit/memory/memory_interface/test_interface_attack_results.py +++ b/tests/unit/memory/memory_interface/test_interface_attack_results.py @@ -45,6 +45,7 @@ def create_attack_result( objective_num: int, outcome: AttackOutcome = AttackOutcome.SUCCESS, labels: dict[str, str] | None = None, + targeted_harm_categories: list[str] | None = None, ): """Helper function to create AttackResult.""" return AttackResult( @@ -52,6 +53,7 @@ def create_attack_result( objective=f"Objective {objective_num}", outcome=outcome, labels=labels or {}, + targeted_harm_categories=targeted_harm_categories or [], ) @@ -996,6 +998,62 @@ def test_get_attack_results_by_labels_falls_back_to_conversation_labels(sqlite_i assert len(results) == 0 +# --------------------------------------------------------------------------- +# targeted_harm_categories tests +# --------------------------------------------------------------------------- + + +def test_attack_result_targeted_harm_categories_round_trip(sqlite_instance: MemoryInterface): + """targeted_harm_categories persists onto AttackResultEntry and round-trips back.""" + attack_result = create_attack_result( + "conv_1", 1, AttackOutcome.SUCCESS, targeted_harm_categories=["violence", "hate"] + ) + sqlite_instance.add_attack_results_to_memory(attack_results=[attack_result]) + + stored = sqlite_instance.get_attack_results(conversation_id="conv_1") + assert len(stored) == 1 + assert sorted(stored[0].targeted_harm_categories) == ["hate", "violence"] + + +def test_attack_result_targeted_harm_categories_defaults_empty(sqlite_instance: MemoryInterface): + """An AttackResult with no harm categories round-trips to an empty list.""" + attack_result = create_attack_result("conv_1", 1, AttackOutcome.SUCCESS) + sqlite_instance.add_attack_results_to_memory(attack_results=[attack_result]) + + stored = sqlite_instance.get_attack_results(conversation_id="conv_1") + assert len(stored) == 1 + assert stored[0].targeted_harm_categories == [] + + +def test_get_attack_results_by_targeted_harm_categories(sqlite_instance: MemoryInterface): + """Filtering by targeted_harm_categories matches attacks targeting ANY listed category.""" + attack_results = [ + create_attack_result("conv_1", 1, AttackOutcome.SUCCESS, targeted_harm_categories=["violence"]), + create_attack_result("conv_2", 2, AttackOutcome.FAILURE, targeted_harm_categories=["hate", "violence"]), + create_attack_result("conv_3", 3, AttackOutcome.SUCCESS, targeted_harm_categories=["self_harm"]), + create_attack_result("conv_4", 4, AttackOutcome.SUCCESS), + ] + sqlite_instance.add_attack_results_to_memory(attack_results=attack_results) + + violence = sqlite_instance.get_attack_results(targeted_harm_categories=["violence"]) + assert {r.conversation_id for r in violence} == {"conv_1", "conv_2"} + + # OR across multiple requested categories. + multi = sqlite_instance.get_attack_results(targeted_harm_categories=["self_harm", "hate"]) + assert {r.conversation_id for r in multi} == {"conv_2", "conv_3"} + + # Case-insensitive match. + case = sqlite_instance.get_attack_results(targeted_harm_categories=["VIOLENCE"]) + assert {r.conversation_id for r in case} == {"conv_1", "conv_2"} + + # No match. + assert sqlite_instance.get_attack_results(targeted_harm_categories=["nonexistent"]) == [] + + # Empty sequence applies no filter. + none_filter = sqlite_instance.get_attack_results(targeted_harm_categories=[]) + assert {r.conversation_id for r in none_filter} == {"conv_1", "conv_2", "conv_3", "conv_4"} + + # --------------------------------------------------------------------------- # get_unique_attack_labels tests # --------------------------------------------------------------------------- From 4cf3665c0c87032706ef6f14b28c8556b824376a Mon Sep 17 00:00:00 2001 From: Richard Lundeen Date: Thu, 11 Jun 2026 18:59:15 -0700 Subject: [PATCH 2/4] Make targeted_harm_categories comments self-contained Drop historical/decision narrative from comments (e.g. 'replaces the removed per-piece filter', migration 'where it belongs') in favor of present-tense descriptions of current behavior. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...f9a1b2_add_targeted_harm_categories_to_attack_results.py | 6 ++---- pyrit/memory/memory_interface.py | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pyrit/memory/alembic/versions/c3d5e7f9a1b2_add_targeted_harm_categories_to_attack_results.py b/pyrit/memory/alembic/versions/c3d5e7f9a1b2_add_targeted_harm_categories_to_attack_results.py index 4aecfc13f2..ca150fe4e1 100644 --- a/pyrit/memory/alembic/versions/c3d5e7f9a1b2_add_targeted_harm_categories_to_attack_results.py +++ b/pyrit/memory/alembic/versions/c3d5e7f9a1b2_add_targeted_harm_categories_to_attack_results.py @@ -4,10 +4,8 @@ """ Add targeted_harm_categories to Attack Results. -Relocates the ``targeted_harm_categories`` field from the per-piece -``PromptMemoryEntries`` (dropped in ``f1a2b3c4d5e6``) to the attack-scoped -``AttackResultEntries`` table, where it belongs. No backfill: the source -per-piece column is already gone and was never populated by production code. +Adds a nullable JSON ``targeted_harm_categories`` column to the +``AttackResultEntries`` table. No backfill. Revision ID: c3d5e7f9a1b2 Revises: b2f4c6a8d1e3 diff --git a/pyrit/memory/memory_interface.py b/pyrit/memory/memory_interface.py index 3f9abeae82..e03af2461d 100644 --- a/pyrit/memory/memory_interface.py +++ b/pyrit/memory/memory_interface.py @@ -2057,9 +2057,8 @@ def get_attack_results( conditions.append(self._get_attack_result_label_condition(labels=effective_labels)) if targeted_harm_categories: - # Attack-scoped harm-category filter (replaces the removed per-piece filter). - # Matches attacks whose AttackResultEntry.targeted_harm_categories array contains - # ANY of the requested categories. + # Match attacks whose targeted_harm_categories array contains ANY of the + # requested categories. conditions.append( self._get_condition_json_array_match( json_column=AttackResultEntry.targeted_harm_categories, From acd5bbb639d58c93e329c79d81cef36d36d9f2d2 Mon Sep 17 00:00:00 2001 From: Richard Lundeen Date: Thu, 11 Jun 2026 19:04:54 -0700 Subject: [PATCH 3/4] Fix list-invariance type error in from_seed_group_async ty flagged SeedGroup(seeds=all_prompts): list[SeedPrompt] is not assignable to the invariant list[SeedUnion] parameter. Build all_prompts as an explicitly-typed list[SeedUnion] display so element types are checked against the union directly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- pyrit/executor/attack/core/attack_parameters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyrit/executor/attack/core/attack_parameters.py b/pyrit/executor/attack/core/attack_parameters.py index 9760b4eed4..f4d44fa7c9 100644 --- a/pyrit/executor/attack/core/attack_parameters.py +++ b/pyrit/executor/attack/core/attack_parameters.py @@ -10,6 +10,7 @@ from pyrit.models import Message, SeedAttackGroup, SeedGroup if TYPE_CHECKING: + from pyrit.models import SeedUnion from pyrit.prompt_target import PromptTarget from pyrit.score import TrueFalseScorer @@ -171,7 +172,7 @@ async def from_seed_group_async( ) # Merge simulated prompts with existing static prompts from the seed_group - all_prompts = list(seed_group.prompts) + simulated_prompts + all_prompts: list[SeedUnion] = [*seed_group.prompts, *simulated_prompts] # Create a temporary prompts-only SeedGroup for extraction # This group contains only prompts (no objective, no simulated config) From 227a25641143edc23aaf7ffe5dc8f21d72a5599b Mon Sep 17 00:00:00 2001 From: Richard Lundeen Date: Fri, 12 Jun 2026 10:41:47 -0700 Subject: [PATCH 4/4] MAINT: address PR review comments for targeted_harm_categories relocation - Include targeted_harm_categories in AttackResult.to_dict()/from_dict() shims so the deprecated round-trip does not silently drop the field. - Restore targeted_harm_categories plumbing through the technique-selection chain (SelectorScope -> epsilon_greedy -> compute_technique_stats) so the re-homed get_attack_results filter is reachable. - Update memory docs: drop stale MessagePiece columns and refresh the 1_sqlite_memory schema output to match the current models. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/code/memory/1_sqlite_memory.ipynb | 11 +++++++---- doc/code/memory/3_memory_data_types.md | 5 +---- pyrit/analytics/technique_analysis.py | 4 ++++ pyrit/models/results/attack_result.py | 2 ++ .../scenarios/adaptive/selectors/epsilon_greedy.py | 1 + .../adaptive/selectors/technique_selector.py | 9 +++++++-- tests/unit/analytics/test_technique_analysis.py | 10 ++++++++++ tests/unit/models/test_attack_result.py | 1 + .../scenarios/adaptive/test_epsilon_greedy.py | 12 ++++++++++++ 9 files changed, 45 insertions(+), 10 deletions(-) diff --git a/doc/code/memory/1_sqlite_memory.ipynb b/doc/code/memory/1_sqlite_memory.ipynb index 96140b45b6..8bd8f19841 100644 --- a/doc/code/memory/1_sqlite_memory.ipynb +++ b/doc/code/memory/1_sqlite_memory.ipynb @@ -35,10 +35,7 @@ " timestamp: DATETIME NOT NULL\n", " labels: JSON NOT NULL\n", " prompt_metadata: JSON NOT NULL\n", - " targeted_harm_categories: JSON NULL\n", " converter_identifiers: JSON NULL\n", - " prompt_target_identifier: JSON NOT NULL\n", - " attack_identifier: JSON NOT NULL\n", " response_error: VARCHAR NULL\n", " original_value_data_type: VARCHAR NOT NULL\n", " original_value: VARCHAR NOT NULL\n", @@ -49,6 +46,12 @@ " original_prompt_id: CHAR(32) NOT NULL\n", " pyrit_version: VARCHAR NULL\n", "\n", + "Table: Conversations\n", + "--------------------\n", + " conversation_id: VARCHAR NOT NULL\n", + " target_identifier: JSON NULL\n", + " pyrit_version: VARCHAR NULL\n", + "\n", "Table: EmbeddingData\n", "--------------------\n", " id: CHAR(32) NOT NULL\n", @@ -98,7 +101,6 @@ " id: CHAR(32) NOT NULL\n", " conversation_id: VARCHAR NOT NULL\n", " objective: VARCHAR NOT NULL\n", - " attack_identifier: JSON NOT NULL\n", " atomic_attack_identifier: JSON NULL\n", " objective_sha256: VARCHAR NULL\n", " last_response_id: CHAR(32) NULL\n", @@ -109,6 +111,7 @@ " outcome_reason: VARCHAR NULL\n", " attack_metadata: JSON NULL\n", " labels: JSON NULL\n", + " targeted_harm_categories: JSON NULL\n", " pruned_conversation_ids: JSON NULL\n", " adversarial_chat_conversation_ids: JSON NULL\n", " timestamp: DATETIME NOT NULL\n", diff --git a/doc/code/memory/3_memory_data_types.md b/doc/code/memory/3_memory_data_types.md index bd78c52f24..71e73a0bf0 100644 --- a/doc/code/memory/3_memory_data_types.md +++ b/doc/code/memory/3_memory_data_types.md @@ -23,11 +23,7 @@ One of the most fundamental data structures in PyRIT is [MessagePiece](../../../ - **`labels`**: Dictionary of labels for categorization and filtering - **`prompt_metadata`**: Component-specific metadata (e.g., blob URIs, document types) - **`converter_identifiers`**: List of converters applied to transform the prompt -- **`scorer_identifier`**: Information about the scorer that evaluated this prompt - **`response_error`**: Error status (e.g., `none`, `blocked`, `processing`) -- **`originator`**: Source of the prompt (`attack`, `converter`, `scorer`, `undefined`) -- **`scores`**: List of `Score` objects associated with this piece -- **`targeted_harm_categories`**: Harm categories associated with the prompt - **`timestamp`**: When the piece was created This rich context allows PyRIT to track the full lifecycle of each interaction, including transformations, targeting, scoring, and error handling. @@ -135,6 +131,7 @@ Scores enable automated evaluation of attack success, content harmfulness, and o - **`outcome_reason`**: Optional explanation for the outcome - **`related_conversations`**: Set of related conversation references - **`metadata`**: Arbitrary metadata about the attack execution +- **`targeted_harm_categories`**: Harm categories this attack targeted, auto-populated from the attack's seed group `AttackResult` objects provide comprehensive reporting on attack campaigns, enabling analysis of red teaming effectiveness and vulnerability identification. diff --git a/pyrit/analytics/technique_analysis.py b/pyrit/analytics/technique_analysis.py index c44c032f4f..b892946113 100644 --- a/pyrit/analytics/technique_analysis.py +++ b/pyrit/analytics/technique_analysis.py @@ -21,6 +21,7 @@ def compute_technique_stats( *, technique_eval_hashes: Sequence[str], scenario_result_id: str | None = None, + targeted_harm_categories: Sequence[str] | None = None, memory: MemoryInterface | None = None, ) -> dict[str, AttackStats]: """ @@ -39,6 +40,8 @@ def compute_technique_stats( Returned dict is keyed by these. scenario_result_id (str | None): Restrict to a single scenario run. Defaults to ``None`` (aggregate across all runs). + targeted_harm_categories (Sequence[str] | None): Restrict to results + whose attack targeted these harm categories. Defaults to ``None``. memory (MemoryInterface | None): Memory backend to query. Defaults to ``CentralMemory.get_memory_instance()``. @@ -54,6 +57,7 @@ def compute_technique_stats( results = memory.get_attack_results( atomic_attack_eval_hashes=list(technique_eval_hashes), scenario_result_id=scenario_result_id, + targeted_harm_categories=targeted_harm_categories, ) requested = set(technique_eval_hashes) diff --git a/pyrit/models/results/attack_result.py b/pyrit/models/results/attack_result.py index 8367674c3f..70ffa50779 100644 --- a/pyrit/models/results/attack_result.py +++ b/pyrit/models/results/attack_result.py @@ -249,6 +249,7 @@ def to_dict(self) -> dict[str, Any]: ), "metadata": self.metadata, "labels": self.labels, + "targeted_harm_categories": self.targeted_harm_categories, "error_message": self.error_message, "error_type": self.error_type, "error_traceback": self.error_traceback, @@ -299,6 +300,7 @@ def from_dict(cls, data: dict[str, Any]) -> AttackResult: }, metadata=data.get("metadata", {}), labels=data.get("labels", {}), + targeted_harm_categories=data.get("targeted_harm_categories", []), error_message=data.get("error_message"), error_type=data.get("error_type"), error_traceback=data.get("error_traceback"), diff --git a/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py b/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py index bb701a6a60..a415182733 100644 --- a/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py +++ b/pyrit/scenario/scenarios/adaptive/selectors/epsilon_greedy.py @@ -132,6 +132,7 @@ async def select_async( stats = compute_technique_stats( technique_eval_hashes=technique_list, scenario_result_id=effective_run_id, + targeted_harm_categories=self._scope.targeted_harm_categories, ) chosen: list[str] = [] diff --git a/pyrit/scenario/scenarios/adaptive/selectors/technique_selector.py b/pyrit/scenario/scenarios/adaptive/selectors/technique_selector.py index 56b03e26f3..0161e4923b 100644 --- a/pyrit/scenario/scenarios/adaptive/selectors/technique_selector.py +++ b/pyrit/scenario/scenarios/adaptive/selectors/technique_selector.py @@ -19,8 +19,9 @@ class SelectorScope: queries when estimating technique success rates. All fields default to "no restriction"; combine fields to narrow the - scope (e.g. current run only). Filter values flow through - ``compute_technique_stats`` to ``MemoryInterface.get_attack_results``. + scope (e.g. current run only, same harm category). Filter values flow + through ``compute_technique_stats`` to + ``MemoryInterface.get_attack_results``. The scope is held by the selector at construction time. The per-call ``scenario_result_id`` is supplied by the dispatcher and is forwarded @@ -37,6 +38,10 @@ class SelectorScope: """Restrict to the dispatcher-supplied ``scenario_result_id`` for the in-flight run. When ``False`` (default), query across all runs.""" + targeted_harm_categories: Sequence[str] | None = None + """Filter to results whose attack targeted these harm categories. + ``None`` means no harm-category filter.""" + @classmethod def all_runs(cls) -> SelectorScope: """ diff --git a/tests/unit/analytics/test_technique_analysis.py b/tests/unit/analytics/test_technique_analysis.py index 20546cd2c8..04b1d94890 100644 --- a/tests/unit/analytics/test_technique_analysis.py +++ b/tests/unit/analytics/test_technique_analysis.py @@ -94,6 +94,7 @@ def test_passes_eval_hashes_to_memory_query(self, _patch_memory): call_kwargs = _patch_memory.get_attack_results.call_args[1] assert call_kwargs["atomic_attack_eval_hashes"] == ["x", "y"] assert call_kwargs["scenario_result_id"] is None + assert call_kwargs["targeted_harm_categories"] is None def test_passes_scenario_result_id_to_memory_query(self, _patch_memory): compute_technique_stats(technique_eval_hashes=["x"], scenario_result_id="run-123") @@ -123,6 +124,15 @@ def test_success_rate_computed(self, _patch_memory): assert stats["a"].success_rate == pytest.approx(0.5) + def test_passes_harm_categories_to_memory_query(self, _patch_memory): + compute_technique_stats( + technique_eval_hashes=["x"], + targeted_harm_categories=["misinformation", "hate"], + ) + + call_kwargs = _patch_memory.get_attack_results.call_args[1] + assert call_kwargs["targeted_harm_categories"] == ["misinformation", "hate"] + def test_injected_memory_bypasses_central_memory(self, _patch_memory): injected = MagicMock() injected.get_attack_results.return_value = [ diff --git a/tests/unit/models/test_attack_result.py b/tests/unit/models/test_attack_result.py index 43e7522052..04a68cb097 100644 --- a/tests/unit/models/test_attack_result.py +++ b/tests/unit/models/test_attack_result.py @@ -290,6 +290,7 @@ def test_to_dict_from_dict_roundtrip(): }, metadata={"model": "gpt-4", "temperature": 0.7}, labels={"category": "violence", "severity": "high"}, + targeted_harm_categories=["violence", "hate"], error_message="partial error", error_type="RuntimeError", error_traceback="Traceback ...\n File ...", diff --git a/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py b/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py index bfade07b3c..21144721f5 100644 --- a/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py +++ b/tests/unit/scenario/scenarios/adaptive/test_epsilon_greedy.py @@ -137,6 +137,7 @@ async def test_default_scope_passes_none_scenario_result_id(self, mock_compute): # Default scope is all_runs(): the per-call scenario_result_id is dropped. assert mock_compute.call_args.kwargs["scenario_result_id"] is None + assert mock_compute.call_args.kwargs["targeted_harm_categories"] is None @patch(_COMPUTE_PATH, side_effect=_empty_rates) async def test_current_run_scope_forwards_scenario_result_id(self, mock_compute): @@ -145,6 +146,17 @@ async def test_current_run_scope_forwards_scenario_result_id(self, mock_compute) assert mock_compute.call_args.kwargs["scenario_result_id"] == "run-42" + @patch(_COMPUTE_PATH, side_effect=_empty_rates) + async def test_scope_filter_fields_forwarded(self, mock_compute): + scope = SelectorScope( + targeted_harm_categories=["misinformation"], + ) + selector = EpsilonGreedyTechniqueSelector(epsilon=0.0, random_seed=0, scope=scope) + await selector.select_async(technique_identifiers=TECHNIQUES, objective="obj") + + kwargs = mock_compute.call_args.kwargs + assert kwargs["targeted_harm_categories"] == ["misinformation"] + class TestEpsilonGreedyEstimate: def test_estimate_unseen_is_one(self):