class TestEvalRunLossAnalysis:
    """Tests for loss analysis integration with EvaluationRun.

    Covers three areas:
      * the ``loss_analysis_config`` / ``loss_analysis_results`` fields on
        ``EvaluationRunConfig`` and ``EvaluationRunResults``;
      * ``EvaluationRun.show()`` forwarding results (and the optional
        ``_eval_item_map``) to ``display_loss_analysis_results``;
      * the visualization helpers ``_get_loss_analysis_html`` and
        ``_enrich_loss_examples_with_eval_items``.
    """

    def test_evaluation_run_config_accepts_loss_analysis_config(self):
        """Tests that EvaluationRunConfig can hold loss_analysis_config."""
        config = common_types.EvaluationRunConfig(
            metrics=[],
            loss_analysis_config=[
                common_types.LossAnalysisConfig(
                    metric="multi_turn_task_success_v1",
                    candidate="travel-agent",
                ),
                common_types.LossAnalysisConfig(
                    metric="multi_turn_tool_use_quality_v1",
                    candidate="travel-agent",
                ),
            ],
        )
        assert len(config.loss_analysis_config) == 2
        assert config.loss_analysis_config[0].metric == "multi_turn_task_success_v1"
        assert config.loss_analysis_config[1].metric == "multi_turn_tool_use_quality_v1"

    def test_evaluation_run_config_loss_analysis_config_optional(self):
        """Tests that loss_analysis_config defaults to None when not provided."""
        config = common_types.EvaluationRunConfig(metrics=[])
        assert config.loss_analysis_config is None

    def test_evaluation_run_results_has_loss_analysis_results(self):
        """Tests that EvaluationRunResults can hold loss_analysis_results."""
        results = common_types.EvaluationRunResults(
            evaluation_set="projects/123/locations/global/evaluationSets/456",
            summary_metrics=common_types.SummaryMetric(
                metrics={}, total_items=10, failed_items=0
            ),
            loss_analysis_results=[
                common_types.LossAnalysisResult(
                    config=common_types.LossAnalysisConfig(
                        metric="multi_turn_task_success_v1",
                        candidate="travel-agent",
                    ),
                    clusters=[
                        common_types.LossCluster(
                            cluster_id="c1",
                            taxonomy_entry=common_types.LossTaxonomyEntry(
                                l1_category="Hallucination",
                                l2_category="Hallucination of Action",
                            ),
                            item_count=3,
                        )
                    ],
                )
            ],
        )
        assert len(results.loss_analysis_results) == 1
        assert results.loss_analysis_results[0].clusters[0].item_count == 3

    def test_evaluation_run_results_loss_analysis_results_optional(self):
        """Tests backward compat: loss_analysis_results defaults to None."""
        results = common_types.EvaluationRunResults(
            evaluation_set="projects/123/locations/global/evaluationSets/456",
            summary_metrics=common_types.SummaryMetric(
                metrics={}, total_items=5, failed_items=0
            ),
        )
        assert results.loss_analysis_results is None

    def test_evaluation_run_show_displays_loss_analysis_without_map(self):
        """Tests show() calls display with eval_item_map=None when no map set."""
        eval_run = common_types.EvaluationRun(
            name="projects/123/locations/global/evaluationRuns/test-run",
            state="SUCCEEDED",
            evaluation_run_results=common_types.EvaluationRunResults(
                evaluation_set="projects/123/locations/global/evaluationSets/456",
                summary_metrics=common_types.SummaryMetric(
                    metrics={}, total_items=5, failed_items=0
                ),
                loss_analysis_results=[
                    common_types.LossAnalysisResult(
                        config=common_types.LossAnalysisConfig(
                            metric="multi_turn_task_success_v1",
                            candidate="agent-1",
                        ),
                        clusters=[],
                    )
                ],
            ),
        )
        with mock.patch.object(
            _evals_visualization,
            "display_loss_analysis_results",
        ) as mock_display:
            eval_run.show()
            mock_display.assert_called_once_with(
                eval_run.evaluation_run_results.loss_analysis_results,
                eval_item_map=None,
            )

    def test_evaluation_run_show_passes_eval_item_map(self):
        """Tests show() passes _eval_item_map to display when set via object.__setattr__."""
        eval_run = common_types.EvaluationRun(
            name="projects/123/locations/global/evaluationRuns/test-run",
            state="SUCCEEDED",
            evaluation_run_results=common_types.EvaluationRunResults(
                evaluation_set="projects/123/locations/global/evaluationSets/456",
                summary_metrics=common_types.SummaryMetric(
                    metrics={}, total_items=5, failed_items=0
                ),
                loss_analysis_results=[
                    common_types.LossAnalysisResult(
                        config=common_types.LossAnalysisConfig(
                            metric="multi_turn_task_success_v1",
                            candidate="agent-1",
                        ),
                        clusters=[
                            common_types.LossCluster(
                                cluster_id="c1",
                                item_count=1,
                                examples=[
                                    common_types.LossExample(
                                        evaluation_item="projects/123/locations/global/evaluationItems/item-1",
                                    )
                                ],
                            )
                        ],
                    )
                ],
            ),
        )
        # Simulate what get_evaluation_run does: set _eval_item_map via object.__setattr__
        # to bypass pydantic extra='forbid'
        test_map = {
            "projects/123/locations/global/evaluationItems/item-1": {
                "request": {
                    "prompt": {
                        "agent_data": {
                            "turns": [
                                {
                                    "events": [
                                        {
                                            "author": "user",
                                            "content": {"parts": [{"text": "Hello"}]},
                                        }
                                    ]
                                }
                            ]
                        }
                    }
                }
            }
        }
        object.__setattr__(eval_run, "_eval_item_map", test_map)

        # Verify _eval_item_map is accessible via getattr
        assert getattr(eval_run, "_eval_item_map", None) is test_map

        with mock.patch.object(
            _evals_visualization,
            "display_loss_analysis_results",
        ) as mock_display:
            eval_run.show()
            mock_display.assert_called_once_with(
                eval_run.evaluation_run_results.loss_analysis_results,
                eval_item_map=test_map,
            )

    def test_evaluation_run_show_no_loss_analysis_does_not_crash(self):
        """Tests EvaluationRun.show() works when no loss analysis results."""
        eval_run = common_types.EvaluationRun(
            name="projects/123/locations/global/evaluationRuns/test-run",
            state="SUCCEEDED",
            evaluation_run_results=common_types.EvaluationRunResults(
                evaluation_set="projects/123/locations/global/evaluationSets/456",
                summary_metrics=common_types.SummaryMetric(
                    metrics={}, total_items=5, failed_items=0
                ),
            ),
        )
        with mock.patch.object(
            _evals_visualization,
            "display_loss_analysis_results",
        ) as mock_display:
            # Should not crash; loss analysis display should NOT be called
            eval_run.show()
            mock_display.assert_not_called()

    def test_display_loss_analysis_results_html(self):
        """Tests that _get_loss_analysis_html embeds the payload in its output."""
        results = [
            common_types.LossAnalysisResult(
                config=common_types.LossAnalysisConfig(
                    metric="multi_turn_task_success_v1",
                    candidate="agent-1",
                ),
                clusters=[
                    common_types.LossCluster(
                        cluster_id="c1",
                        taxonomy_entry=common_types.LossTaxonomyEntry(
                            l1_category="Tool Calling",
                            l2_category="Missing Invocation",
                            description="Agent failed to call the tool.",
                        ),
                        item_count=5,
                    )
                ],
            )
        ]
        payload_json = json.dumps(
            {
                "results": [
                    r.model_dump(mode="json", exclude_none=True) for r in results
                ]
            },
            ensure_ascii=False,
        )
        html = _evals_visualization._get_loss_analysis_html(payload_json)
        # The HTML is a self-contained report with base64-encoded JSON payload
        # decoded by JavaScript at runtime. Verify structure, not content.
        # An `"" in html` check is true for every string, so assert instead
        # that a non-blank string was actually produced.
        assert isinstance(html, str) and html.strip()
        assert "Loss Pattern Analysis" in html
        # Verify the payload is embedded as base64 in the HTML
        payload_b64 = base64.b64encode(payload_json.encode("utf-8")).decode("ascii")
        assert payload_b64 in html

    def test_enrich_loss_examples_with_eval_item_map(self):
        """Tests that _enrich_loss_examples_with_eval_items populates evaluation_result."""
        # Create loss results where examples only have evaluation_item (eval run path)
        results = [
            common_types.LossAnalysisResult(
                config=common_types.LossAnalysisConfig(
                    metric="multi_turn_task_success_v1",
                    candidate="agent-1",
                ),
                clusters=[
                    common_types.LossCluster(
                        cluster_id="c1",
                        taxonomy_entry=common_types.LossTaxonomyEntry(
                            l1_category="Tool Calling",
                            l2_category="Missing Invocation",
                        ),
                        item_count=2,
                        examples=[
                            common_types.LossExample(
                                evaluation_item="projects/123/locations/global/evaluationItems/item-1",
                                failed_rubrics=[
                                    common_types.FailedRubric(
                                        rubric_id="tool_invocation"
                                    )
                                ],
                            ),
                            common_types.LossExample(
                                evaluation_item="projects/123/locations/global/evaluationItems/item-2",
                                failed_rubrics=[
                                    common_types.FailedRubric(
                                        rubric_id="tool_invocation"
                                    )
                                ],
                            ),
                        ],
                    )
                ],
            )
        ]

        # Build an eval_item_map matching the actual eval run data shape:
        # - scenario is in prompt.user_scenario.starting_prompt
        # - agent traces are in candidate_responses[].agent_data.turns
        # - rubric verdicts are in candidate_results[].rubric_verdicts
        eval_item_map = {
            "projects/123/locations/global/evaluationItems/item-1": {
                "request": {
                    "prompt": {
                        "user_scenario": {
                            "starting_prompt": "Book a flight to Paris",
                        }
                    },
                    "candidate_responses": [
                        {
                            "candidate": "agent-1",
                            "agent_data": {
                                "turns": [
                                    {
                                        "events": [
                                            {
                                                "author": "user",
                                                "content": {
                                                    "parts": [
                                                        {
                                                            "text": "Book a flight to Paris"
                                                        }
                                                    ]
                                                },
                                            }
                                        ],
                                    }
                                ]
                            },
                        }
                    ],
                },
                "candidate_results": [
                    {
                        "metric": "multi_turn_task_success_v1",
                        "candidate": "agent-1",
                        "rubric_verdicts": [
                            {
                                "evaluated_rubric": {
                                    "rubric_id": "tool_invocation",
                                    "content": {
                                        "property": {
                                            "description": "Agent should call find_flights tool"
                                        }
                                    },
                                },
                                "verdict": False,
                            }
                        ],
                    }
                ],
            },
            "projects/123/locations/global/evaluationItems/item-2": {
                "request": {
                    "prompt": {
                        "user_scenario": {
                            "starting_prompt": "Find hotels in Tokyo",
                        }
                    },
                    "candidate_responses": [
                        {
                            "candidate": "agent-1",
                            "agent_data": {
                                "turns": [
                                    {
                                        "events": [
                                            {
                                                "author": "user",
                                                "content": {
                                                    "parts": [
                                                        {"text": "Find hotels in Tokyo"}
                                                    ]
                                                },
                                            }
                                        ],
                                    }
                                ]
                            },
                        }
                    ],
                },
                "candidate_results": [],
            },
        }

        enriched = _evals_visualization._enrich_loss_examples_with_eval_items(
            results, eval_item_map
        )

        # Verify enrichment happened
        assert len(enriched) == 1
        clusters = enriched[0]["clusters"]
        assert len(clusters) == 1
        examples = clusters[0]["examples"]
        assert len(examples) == 2

        # First example should have evaluation_result with user_scenario
        ex1 = examples[0]
        assert "evaluation_result" in ex1
        er1 = ex1["evaluation_result"]
        assert (
            er1["request"]["prompt"]["user_scenario"]["starting_prompt"]
            == "Book a flight to Paris"
        )
        # Agent data is on candidate_responses (eval run path)
        assert (
            er1["request"]["candidate_responses"][0]["agent_data"]["turns"][0][
                "events"
            ][0]["content"]["parts"][0]["text"]
            == "Book a flight to Paris"
        )
        # Rubric data
        assert (
            er1["candidate_results"][0]["rubric_verdicts"][0]["evaluated_rubric"][
                "content"
            ]["property"]["description"]
            == "Agent should call find_flights tool"
        )

        # Second example should also have evaluation_result
        ex2 = examples[1]
        assert "evaluation_result" in ex2
        er2 = ex2["evaluation_result"]
        assert (
            er2["request"]["prompt"]["user_scenario"]["starting_prompt"]
            == "Find hotels in Tokyo"
        )

    def test_enrich_skips_already_populated_evaluation_result(self):
        """Tests that enrichment skips examples that already have evaluation_result (LRO path)."""
        results = [
            common_types.LossAnalysisResult(
                config=common_types.LossAnalysisConfig(metric="m1", candidate="c1"),
                clusters=[
                    common_types.LossCluster(
                        cluster_id="c1",
                        item_count=1,
                        examples=[
                            common_types.LossExample(
                                evaluation_item="projects/123/locations/global/evaluationItems/item-1",
                                evaluation_result={
                                    "request": {"prompt": {"text": "original"}}
                                },
                            ),
                        ],
                    )
                ],
            )
        ]
        eval_item_map = {
            "projects/123/locations/global/evaluationItems/item-1": {
                "request": {"prompt": {"text": "should-not-replace"}}
            },
        }
        enriched = _evals_visualization._enrich_loss_examples_with_eval_items(
            results, eval_item_map
        )
        # Should keep the original evaluation_result, not replace it
        ex = enriched[0]["clusters"][0]["examples"][0]
        assert ex["evaluation_result"]["request"]["prompt"]["text"] == "original"

    def test_enrich_with_none_map(self):
        """Tests enrichment with no eval_item_map (backward compat)."""
        results = [
            common_types.LossAnalysisResult(
                config=common_types.LossAnalysisConfig(metric="m1", candidate="c1"),
                clusters=[
                    common_types.LossCluster(
                        cluster_id="c1",
                        item_count=1,
                        examples=[
                            common_types.LossExample(
                                evaluation_item="projects/123/evaluationItems/item-1",
                            ),
                        ],
                    )
                ],
            )
        ]
        enriched = _evals_visualization._enrich_loss_examples_with_eval_items(
            results, None
        )
        # Should not crash, evaluation_result stays absent
        ex = enriched[0]["clusters"][0]["examples"][0]
        assert "evaluation_result" not in ex

    def test_evaluation_run_config_serialization_with_loss_analysis(self):
        """Tests that EvaluationRunConfig with loss_analysis_config serializes."""
        config = common_types.EvaluationRunConfig(
            metrics=[],
            loss_analysis_config=[
                common_types.LossAnalysisConfig(
                    metric="multi_turn_task_success_v1",
                    candidate="travel-agent",
                ),
            ],
        )
        dumped = config.model_dump(mode="json", exclude_none=True)
        assert "loss_analysis_config" in dumped
        assert len(dumped["loss_analysis_config"]) == 1
        assert (
            dumped["loss_analysis_config"][0]["metric"] == "multi_turn_task_success_v1"
        )
"multi_turn_task_success_v1", + "multi_turn_tool_use_quality_v1", + ], + inference_configs={"travel-agent": {}}, + ) + assert len(result) == 2 + assert result[0].metric == "multi_turn_task_success_v1" + assert result[0].candidate == "travel-agent" + assert result[1].metric == "multi_turn_tool_use_quality_v1" + assert result[1].candidate == "travel-agent" + + def test_loss_analysis_metrics_multi_candidate_raises(self): + """Raises when multiple candidates and using simplified metrics.""" + with pytest.raises(ValueError, match="multiple candidates"): + _evals_utils._resolve_eval_run_loss_configs( + loss_analysis_metrics=["task_success_v1"], + inference_configs={"agent-a": {}, "agent-b": {}}, + ) + + def test_loss_analysis_metrics_no_inference_configs(self): + """Creates configs with candidate=None when no inference_configs.""" + result = _evals_utils._resolve_eval_run_loss_configs( + loss_analysis_metrics=["task_success_v1"], + ) + assert len(result) == 1 + assert result[0].metric == "task_success_v1" + assert result[0].candidate is None + + def test_loss_analysis_configs_passthrough(self): + """Explicit configs are passed through without modification.""" + configs = [ + common_types.LossAnalysisConfig( + metric="task_success_v1", + candidate="agent-1", + max_top_cluster_count=5, + ) + ] + result = _evals_utils._resolve_eval_run_loss_configs( + loss_analysis_configs=configs, + ) + assert len(result) == 1 + assert result[0].metric == "task_success_v1" + assert result[0].candidate == "agent-1" + assert result[0].max_top_cluster_count == 5 + + def test_loss_analysis_configs_dict_input(self): + """Dict configs are validated into LossAnalysisConfig objects.""" + result = _evals_utils._resolve_eval_run_loss_configs( + loss_analysis_configs=[ + {"metric": "task_success_v1", "candidate": "agent-1"} + ], + ) + assert len(result) == 1 + assert isinstance(result[0], common_types.LossAnalysisConfig) + assert result[0].metric == "task_success_v1" + + def 
def _build_eval_item_map(
    eval_items: list[types.EvaluationItem],
) -> dict[str, dict[str, Any]]:
    """Builds a mapping from EvaluationItem resource name to serialized data.

    This is used by the loss analysis visualization to enrich examples with
    scenario and rubric data from the original evaluation items.

    Items without a ``name`` or without an ``evaluation_response`` are
    skipped. Serialization is best-effort: an item whose
    ``evaluation_response.model_dump`` raises is left out of the map and a
    warning is logged, so one bad item does not prevent the rest from being
    visualized.

    Args:
        eval_items: The list of EvaluationItem objects. ``None`` or an empty
          list yields an empty map.

    Returns:
        A dict mapping evaluation item resource name to the serialized
        evaluation_response dict (which the JS visualization reads as
        ``evaluation_result``).
    """
    # Imported locally so this helper does not depend on a particular
    # module-level logger name being defined in this file.
    import logging

    item_map: dict[str, dict[str, Any]] = {}
    for item in eval_items or []:
        if not (item.name and item.evaluation_response):
            continue
        try:
            item_map[item.name] = item.evaluation_response.model_dump(
                mode="json", exclude_none=True
            )
        except Exception as e:  # Deliberately broad: enrichment is optional.
            logging.getLogger(__name__).warning(
                "Skipping evaluation item %s in loss analysis item map:"
                " failed to serialize evaluation_response: %s",
                item.name,
                e,
            )
    return item_map
def _resolve_eval_run_loss_configs(
    loss_analysis_metrics: Optional[list[Any]] = None,
    loss_analysis_configs: Optional[list[Any]] = None,
    inference_configs: Optional[dict[str, Any]] = None,
) -> Optional[list[types.LossAnalysisConfig]]:
    """Resolves loss analysis configs for create_evaluation_run.

    Supports two mutually exclusive modes:
      1. ``loss_analysis_metrics``: A simplified list of metrics. The candidate
         is auto-inferred from ``inference_configs`` when there is exactly one
         candidate, and left as ``None`` when ``inference_configs`` is empty
         or not given. Each metric is resolved via ``_resolve_metric_name()``.
      2. ``loss_analysis_configs``: Explicit ``LossAnalysisConfig`` objects or
         dicts for full control. Dicts are validated into
         ``LossAnalysisConfig``; other values are passed through unchanged.

    Args:
        loss_analysis_metrics: Optional list of metric references (strings,
          Metric objects, or RubricMetric enums).
        loss_analysis_configs: Optional list of LossAnalysisConfig or dicts.
        inference_configs: The inference_configs dict (candidate name ->
          config). Used to auto-infer candidate for the metrics path.

    Returns:
        A list of resolved LossAnalysisConfig objects, or None if neither
        loss_analysis_metrics nor loss_analysis_configs is provided.

    Raises:
        ValueError: If both loss_analysis_metrics and loss_analysis_configs
          are provided, or if the candidate cannot be inferred for
          loss_analysis_metrics because inference_configs holds more than
          one candidate.
    """
    # The public API documents the two arguments as mutually exclusive;
    # enforce that here too rather than silently dropping the metrics.
    if loss_analysis_metrics and loss_analysis_configs:
        raise ValueError(
            "At most one of loss_analysis_metrics or loss_analysis_configs"
            " can be provided."
        )

    if loss_analysis_configs:
        return [
            types.LossAnalysisConfig.model_validate(c) if isinstance(c, dict) else c
            for c in loss_analysis_configs
        ]

    if not loss_analysis_metrics:
        return None

    # loss_analysis_metrics path: auto-infer candidate from inference_configs
    candidate_names = list(inference_configs or {})
    if len(candidate_names) > 1:
        raise ValueError(
            "Cannot infer candidate for loss analysis: multiple candidates"
            f" found in inference_configs: {candidate_names}."
            " Please use loss_analysis_configs with explicit candidate values"
            " instead."
        )
    candidate = candidate_names[0] if candidate_names else None

    return [
        types.LossAnalysisConfig(
            metric=_resolve_metric_name(m), candidate=candidate
        )
        for m in loss_analysis_metrics
    ]
const er = ex.evaluation_result; if (!er) return null; - const prompt = er.request && er.request.prompt; - if (!prompt) return null; - // Try agent_data path (snake_case or camelCase) - const agentData = prompt.agent_data || prompt.agentData; - if (agentData && agentData.turns) {{ + const req = er.request; + if (!req) return null; + const prompt = req.prompt; + + // Helper: extract first user text from agent_data turns + function firstUserText(agentData) {{ + if (!agentData || !agentData.turns) return null; for (const turn of agentData.turns) {{ if (!turn.events) continue; for (const event of turn.events) {{ @@ -1726,16 +1728,47 @@ def _get_loss_analysis_html(loss_analysis_json: str) -> str: }} }} }} + return null; }} - // Try simple prompt path: request.prompt.parts[].text - if (prompt.parts) {{ - for (const part of prompt.parts) {{ - if (part.text) {{ - const text = part.text.trim(); + + if (prompt) {{ + // Path 1: prompt.agent_data.turns (LRO inline results path) + const agentData = prompt.agent_data || prompt.agentData; + const fromPromptAgent = firstUserText(agentData); + if (fromPromptAgent) return fromPromptAgent; + + // Path 2: prompt.user_scenario.starting_prompt (eval run path) + const scenario = prompt.user_scenario || prompt.userScenario; + if (scenario) {{ + const sp = scenario.starting_prompt || scenario.startingPrompt; + if (sp) {{ + const text = sp.trim(); return text.length > 150 ? text.substring(0, 150) + '...' : text; }} }} + + // Path 3: prompt.parts[].text (simple prompt path) + if (prompt.parts) {{ + for (const part of prompt.parts) {{ + if (part.text) {{ + const text = part.text.trim(); + return text.length > 150 ? text.substring(0, 150) + '...' 
def _enrich_loss_examples_with_eval_items(
    results: list["types.LossAnalysisResult"],
    eval_item_map: Optional[dict[str, dict[str, Any]]],
) -> list[dict[str, Any]]:
    """Enriches loss analysis examples with eval item data for visualization.

    For the eval run path, loss examples only have ``evaluation_item``
    (a resource name) but no ``evaluation_result``. The JS visualization
    needs ``evaluation_result`` to extract scenario previews and rubric
    descriptions. This function joins the loss examples with the eval
    item map so the visualization works identically to the LRO path.

    Args:
        results: Loss analysis results from the eval run. ``None`` or an
          empty list yields an empty list.
        eval_item_map: Optional mapping from evaluation item resource name
          to serialized evaluation response data (built by
          ``_evals_common._build_eval_item_map``). When falsy, the results
          are only serialized and no enrichment is attempted.

    Returns:
        A list of dicts ready for JSON serialization, with ``evaluation_result``
        populated on each example where a match is found. Matched entries
        are attached by reference (the map's dict is not copied).
    """
    result_dicts: list[dict[str, Any]] = []
    for result in results or []:
        dumped = result.model_dump(mode="json", exclude_none=True)
        if eval_item_map:
            for cluster in dumped.get("clusters", []):
                for example in cluster.get("examples", []):
                    # Skip if evaluation_result is already populated (LRO path)
                    if example.get("evaluation_result"):
                        continue
                    # Match by evaluation_item resource name
                    item_ref = example.get("evaluation_item")
                    if item_ref and item_ref in eval_item_map:
                        example["evaluation_result"] = eval_item_map[item_ref]
        result_dicts.append(dumped)
    return result_dicts


def display_loss_analysis_results(
    results: list["types.LossAnalysisResult"],
    eval_item_map: Optional[dict[str, dict[str, Any]]] = None,
) -> None:
    """Displays loss analysis results from an EvaluationRun.

    Wraps the list of LossAnalysisResult objects into a ``{"results": [...]}``
    JSON payload and renders it using the shared _get_loss_analysis_html()
    function. Outside an IPython environment this logs a warning and
    returns without rendering.

    When ``eval_item_map`` is provided (from
    ``get_evaluation_run(include_evaluation_items=True)``), the examples
    are enriched with scenario and rubric data for the visualization.

    Args:
        results: A list of LossAnalysisResult objects from
          EvaluationRunResults.loss_analysis_results.
        eval_item_map: Optional mapping from evaluation item resource name
          to serialized evaluation response data for enrichment.

    Raises:
        Exception: Any error raised while enriching or JSON-encoding the
          results is logged and then re-raised unchanged.
    """
    if not _is_ipython_env():
        logger.warning("Skipping display: not in an IPython environment.")
        return

    # Imported lazily so the module can be used without IPython installed.
    from IPython import display

    try:
        result_dicts = _enrich_loss_examples_with_eval_items(results, eval_item_map)
        # json.dumps is inside the try so that encoding failures are also
        # reported by the "Failed to serialize" log below.
        payload_json = json.dumps(
            {"results": result_dicts},
            ensure_ascii=False,
            default=_pydantic_serializer,
        )
    except Exception as e:
        logger.error(
            "Failed to serialize loss analysis results: %s",
            e,
            exc_info=True,
        )
        raise

    html_content = _get_loss_analysis_html(payload_json)
    display.display(display.HTML(html_content))
result.inference_configs, - ) + eval_result, eval_item_map = _evals_common._convert_evaluation_run_results( + self._api_client, + result.evaluation_run_results, + result.inference_configs, ) + result.evaluation_item_results = eval_result + # Bypass pydantic validation (extra='forbid') for this internal field. + object.__setattr__(result, "_eval_item_map", eval_item_map) return result @_common.experimental_warning( @@ -2423,6 +2438,8 @@ def create_evaluation_run( dict[str, types.EvaluationRunInferenceConfigOrDict] ] = None, labels: Optional[dict[str, str]] = None, + loss_analysis_metrics: Optional[list[Union[str, types.MetricOrDict]]] = None, + loss_analysis_configs: Optional[list[types.LossAnalysisConfigOrDict]] = None, config: Optional[types.CreateEvaluationRunConfigOrDict] = None, ) -> types.EvaluationRun: """Creates an EvaluationRun. @@ -2452,11 +2469,37 @@ def create_evaluation_run( Example: {"candidate-1": types.EvaluationRunInferenceConfig(model="gemini-2.5-flash")} labels: The labels to apply to the evaluation run. + loss_analysis_metrics: This field is experimental and may change in future + versions. Optional list of metrics to run loss analysis on. The + candidate is auto-inferred from ``inference_configs`` or + ``agent_info`` when there is exactly one candidate. Each metric can be + a string (e.g., ``"multi_turn_task_success_v1"``), a ``Metric`` + object, or a ``RubricMetric`` enum + (e.g., ``types.RubricMetric.MULTI_TURN_TASK_SUCCESS``). Loss analysis + runs after metric calculation completes. + Mutually exclusive with ``loss_analysis_configs``. + Example:: + + loss_analysis_metrics=[ + types.RubricMetric.MULTI_TURN_TASK_SUCCESS, + types.RubricMetric.MULTI_TURN_TOOL_USE_QUALITY, + ] + loss_analysis_configs: This field is experimental and may change in future + versions. 
Optional list of ``LossAnalysisConfig`` objects for full + control over loss analysis, including explicit candidate and + advanced options like ``predefined_taxonomy`` and + ``max_top_cluster_count``. Mutually exclusive with + ``loss_analysis_metrics``. config: The configuration for the evaluation run. Returns: The created evaluation run. """ + if loss_analysis_metrics and loss_analysis_configs: + raise ValueError( + "At most one of loss_analysis_metrics or loss_analysis_configs" + " can be provided." + ) if agent_info and inference_configs: raise ValueError( "At most one of agent_info or inference_configs can be provided." @@ -2498,8 +2541,15 @@ def create_evaluation_run( resolved_metrics = _evals_common._resolve_evaluation_run_metrics( metrics, self._api_client ) + resolved_loss_configs = _evals_utils._resolve_eval_run_loss_configs( + loss_analysis_metrics=loss_analysis_metrics, + loss_analysis_configs=loss_analysis_configs, + inference_configs=inference_configs, + ) evaluation_config = types.EvaluationRunConfig( - output_config=output_config, metrics=resolved_metrics + output_config=output_config, + metrics=resolved_metrics, + loss_analysis_config=resolved_loss_configs, ) resolved_inference_configs = _evals_common._resolve_inference_configs( self._api_client, resolved_dataset, inference_configs, parsed_agent_info @@ -3968,13 +4018,16 @@ async def get_evaluation_run( name = name.split("/")[-1] result = await self._get_evaluation_run(name=name, config=config) if include_evaluation_items: - result.evaluation_item_results = ( + eval_result, eval_item_map = ( await _evals_common._convert_evaluation_run_results_async( self._api_client, result.evaluation_run_results, result.inference_configs, ) ) + result.evaluation_item_results = eval_result + # Bypass pydantic validation (extra='forbid') for this internal field. 
+ object.__setattr__(result, "_eval_item_map", eval_item_map) return result @@ -3997,6 +4050,8 @@ async def create_evaluation_run( dict[str, types.EvaluationRunInferenceConfigOrDict] ] = None, labels: Optional[dict[str, str]] = None, + loss_analysis_metrics: Optional[list[Union[str, types.MetricOrDict]]] = None, + loss_analysis_configs: Optional[list[types.LossAnalysisConfigOrDict]] = None, config: Optional[types.CreateEvaluationRunConfigOrDict] = None, ) -> types.EvaluationRun: """Creates an EvaluationRun. @@ -4026,11 +4081,37 @@ async def create_evaluation_run( Example: {"candidate-1": types.EvaluationRunInferenceConfig(model="gemini-2.5-flash")} labels: The labels to apply to the evaluation run. + loss_analysis_metrics: This field is experimental and may change in future + versions. Optional list of metrics to run loss analysis on. The + candidate is auto-inferred from ``inference_configs`` or + ``agent_info`` when there is exactly one candidate. Each metric can be + a string (e.g., ``"multi_turn_task_success_v1"``), a ``Metric`` + object, or a ``RubricMetric`` enum + (e.g., ``types.RubricMetric.MULTI_TURN_TASK_SUCCESS``). Loss analysis + runs after metric calculation completes. + Mutually exclusive with ``loss_analysis_configs``. + Example:: + + loss_analysis_metrics=[ + types.RubricMetric.MULTI_TURN_TASK_SUCCESS, + types.RubricMetric.MULTI_TURN_TOOL_USE_QUALITY, + ] + loss_analysis_configs: This field is experimental and may change in future + versions. Optional list of ``LossAnalysisConfig`` objects for full + control over loss analysis, including explicit candidate and + advanced options like ``predefined_taxonomy`` and + ``max_top_cluster_count``. Mutually exclusive with + ``loss_analysis_metrics``. config: The configuration for the evaluation run. Returns: The created evaluation run. """ + if loss_analysis_metrics and loss_analysis_configs: + raise ValueError( + "At most one of loss_analysis_metrics or loss_analysis_configs" + " can be provided." 
+ ) if agent_info and inference_configs: raise ValueError( "At most one of agent_info or inference_configs can be provided." @@ -4072,8 +4153,15 @@ async def create_evaluation_run( resolved_metrics = _evals_common._resolve_evaluation_run_metrics( metrics, self._api_client ) + resolved_loss_configs = _evals_utils._resolve_eval_run_loss_configs( + loss_analysis_metrics=loss_analysis_metrics, + loss_analysis_configs=loss_analysis_configs, + inference_configs=inference_configs, + ) evaluation_config = types.EvaluationRunConfig( - output_config=output_config, metrics=resolved_metrics + output_config=output_config, + metrics=resolved_metrics, + loss_analysis_config=resolved_loss_configs, ) resolved_inference_configs = _evals_common._resolve_inference_configs( self._api_client, resolved_dataset, inference_configs, parsed_agent_info diff --git a/vertexai/_genai/types/__init__.py b/vertexai/_genai/types/__init__.py index 51d0c9a32c..b741417e1b 100644 --- a/vertexai/_genai/types/__init__.py +++ b/vertexai/_genai/types/__init__.py @@ -1336,6 +1336,9 @@ "EvaluationRunPromptTemplate", "EvaluationRunPromptTemplateDict", "EvaluationRunPromptTemplateOrDict", + "LossAnalysisConfig", + "LossAnalysisConfigDict", + "LossAnalysisConfigOrDict", "EvaluationRunConfig", "EvaluationRunConfigDict", "EvaluationRunConfigOrDict", @@ -1354,6 +1357,21 @@ "SummaryMetric", "SummaryMetricDict", "SummaryMetricOrDict", + "LossTaxonomyEntry", + "LossTaxonomyEntryDict", + "LossTaxonomyEntryOrDict", + "FailedRubric", + "FailedRubricDict", + "FailedRubricOrDict", + "LossExample", + "LossExampleDict", + "LossExampleOrDict", + "LossCluster", + "LossClusterDict", + "LossClusterOrDict", + "LossAnalysisResult", + "LossAnalysisResultDict", + "LossAnalysisResultOrDict", "EvaluationRunResults", "EvaluationRunResultsDict", "EvaluationRunResultsOrDict", @@ -1546,27 +1564,9 @@ "GenerateUserScenariosResponse", "GenerateUserScenariosResponseDict", "GenerateUserScenariosResponseOrDict", - "LossAnalysisConfig", - 
"LossAnalysisConfigDict", - "LossAnalysisConfigOrDict", "GenerateLossClustersConfig", "GenerateLossClustersConfigDict", "GenerateLossClustersConfigOrDict", - "LossTaxonomyEntry", - "LossTaxonomyEntryDict", - "LossTaxonomyEntryOrDict", - "FailedRubric", - "FailedRubricDict", - "FailedRubricOrDict", - "LossExample", - "LossExampleDict", - "LossExampleOrDict", - "LossCluster", - "LossClusterDict", - "LossClusterOrDict", - "LossAnalysisResult", - "LossAnalysisResultDict", - "LossAnalysisResultOrDict", "GenerateLossClustersResponse", "GenerateLossClustersResponseDict", "GenerateLossClustersResponseOrDict", diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index d920b7cf06..56a038c3d6 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -2284,6 +2284,46 @@ class EvaluationRunPromptTemplateDict(TypedDict, total=False): ] +class LossAnalysisConfig(_common.BaseModel): + """Configuration for the loss analysis job.""" + + metric: Optional[str] = Field( + default=None, + description="""Required. The metric to analyze (e.g., "multi_turn_tool_use_quality_v1").""", + ) + candidate: Optional[str] = Field( + default=None, + description="""Required. The candidate model/agent to analyze (e.g., "gemini-3.1-pro-preview"). This targets the specific CandidateResult within the EvaluationResult.""", + ) + predefined_taxonomy: Optional[str] = Field( + default=None, + description="""Optional. The identifier for the pre-defined taxonomy to use (e.g., "agent_taxonomy_v1", "tool_use_v2"). If not specified, the service may select a default based on the metric.""", + ) + max_top_cluster_count: Optional[int] = Field( + default=None, + description="""Optional. Limits the analysis to the top N clusters. If not specified or set to 0, all clusters are returned.""", + ) + + +class LossAnalysisConfigDict(TypedDict, total=False): + """Configuration for the loss analysis job.""" + + metric: Optional[str] + """Required. 
The metric to analyze (e.g., "multi_turn_tool_use_quality_v1").""" + + candidate: Optional[str] + """Required. The candidate model/agent to analyze (e.g., "gemini-3.1-pro-preview"). This targets the specific CandidateResult within the EvaluationResult.""" + + predefined_taxonomy: Optional[str] + """Optional. The identifier for the pre-defined taxonomy to use (e.g., "agent_taxonomy_v1", "tool_use_v2"). If not specified, the service may select a default based on the metric.""" + + max_top_cluster_count: Optional[int] + """Optional. Limits the analysis to the top N clusters. If not specified or set to 0, all clusters are returned.""" + + +LossAnalysisConfigOrDict = Union[LossAnalysisConfig, LossAnalysisConfigDict] + + class EvaluationRunConfig(_common.BaseModel): """The evaluation configuration used for the evaluation run.""" @@ -2300,6 +2340,10 @@ class EvaluationRunConfig(_common.BaseModel): prompt_template: Optional[EvaluationRunPromptTemplate] = Field( default=None, description="""The prompt template used for inference.""" ) + loss_analysis_config: Optional[list[LossAnalysisConfig]] = Field( + default=None, + description="""Specifications for loss analysis. Each config specifies a metric and candidate to analyze for loss patterns.""", + ) class EvaluationRunConfigDict(TypedDict, total=False): @@ -2317,6 +2361,9 @@ class EvaluationRunConfigDict(TypedDict, total=False): prompt_template: Optional[EvaluationRunPromptTemplateDict] """The prompt template used for inference.""" + loss_analysis_config: Optional[list[LossAnalysisConfigDict]] + """Specifications for loss analysis. 
Each config specifies a metric and candidate to analyze for loss patterns.""" + EvaluationRunConfigOrDict = Union[EvaluationRunConfig, EvaluationRunConfigDict] @@ -2541,6 +2588,175 @@ class SummaryMetricDict(TypedDict, total=False): SummaryMetricOrDict = Union[SummaryMetric, SummaryMetricDict] +class LossTaxonomyEntry(_common.BaseModel): + """A specific entry in the loss pattern taxonomy.""" + + l1_category: Optional[str] = Field( + default=None, + description="""The primary category of the loss (e.g., "Hallucination", "Tool Calling").""", + ) + l2_category: Optional[str] = Field( + default=None, + description="""The secondary category of the loss (e.g., "Hallucination of Action", "Incorrect Tool Selection").""", + ) + description: Optional[str] = Field( + default=None, + description="""A detailed description of this loss pattern. Example: "The agent verbally confirms an action without executing the tool." """, + ) + + +class LossTaxonomyEntryDict(TypedDict, total=False): + """A specific entry in the loss pattern taxonomy.""" + + l1_category: Optional[str] + """The primary category of the loss (e.g., "Hallucination", "Tool Calling").""" + + l2_category: Optional[str] + """The secondary category of the loss (e.g., "Hallucination of Action", "Incorrect Tool Selection").""" + + description: Optional[str] + """A detailed description of this loss pattern. Example: "The agent verbally confirms an action without executing the tool." 
""" + + +LossTaxonomyEntryOrDict = Union[LossTaxonomyEntry, LossTaxonomyEntryDict] + + +class FailedRubric(_common.BaseModel): + """A specific failed rubric and the associated analysis.""" + + rubric_id: Optional[str] = Field( + default=None, + description="""The unique ID of the rubric (if available from the metric source).""", + ) + classification_rationale: Optional[str] = Field( + default=None, + description="""The rationale provided by the Loss Analysis Classifier for why this failure maps to this specific Loss Cluster.""", + ) + + +class FailedRubricDict(TypedDict, total=False): + """A specific failed rubric and the associated analysis.""" + + rubric_id: Optional[str] + """The unique ID of the rubric (if available from the metric source).""" + + classification_rationale: Optional[str] + """The rationale provided by the Loss Analysis Classifier for why this failure maps to this specific Loss Cluster.""" + + +FailedRubricOrDict = Union[FailedRubric, FailedRubricDict] + + +class LossExample(_common.BaseModel): + """A specific example of a loss pattern.""" + + evaluation_item: Optional[str] = Field( + default=None, + description="""Reference to the persisted EvalItem resource name. Format: projects/.../locations/.../evaluationItems/{item_id}.""", + ) + evaluation_result: Optional[dict[str, Any]] = Field( + default=None, + description="""The full evaluation result object provided inline. Used when the analysis is performed on ephemeral data.""", + ) + failed_rubrics: Optional[list[FailedRubric]] = Field( + default=None, + description="""The specific rubric(s) that failed and caused this example to be classified here. An example might fail multiple rubrics, but only specific ones trigger this loss pattern.""", + ) + + +class LossExampleDict(TypedDict, total=False): + """A specific example of a loss pattern.""" + + evaluation_item: Optional[str] + """Reference to the persisted EvalItem resource name. 
Format: projects/.../locations/.../evaluationItems/{item_id}.""" + + evaluation_result: Optional[dict[str, Any]] + """The full evaluation result object provided inline. Used when the analysis is performed on ephemeral data.""" + + failed_rubrics: Optional[list[FailedRubricDict]] + """The specific rubric(s) that failed and caused this example to be classified here. An example might fail multiple rubrics, but only specific ones trigger this loss pattern.""" + + +LossExampleOrDict = Union[LossExample, LossExampleDict] + + +class LossCluster(_common.BaseModel): + """A semantic grouping of failures (e.g., "Hallucination of Action").""" + + cluster_id: Optional[str] = Field( + default=None, + description="""Unique identifier for the loss cluster within the scope of the analysis result.""", + ) + taxonomy_entry: Optional[LossTaxonomyEntry] = Field( + default=None, + description="""The structured definition of the loss taxonomy for this cluster.""", + ) + item_count: Optional[int] = Field( + default=None, + description="""The total number of EvaluationItems falling into this cluster.""", + ) + examples: Optional[list[LossExample]] = Field( + default=None, + description="""A list of examples that belong to this cluster. This links the cluster back to the specific EvaluationItems and Rubrics.""", + ) + + +class LossClusterDict(TypedDict, total=False): + """A semantic grouping of failures (e.g., "Hallucination of Action").""" + + cluster_id: Optional[str] + """Unique identifier for the loss cluster within the scope of the analysis result.""" + + taxonomy_entry: Optional[LossTaxonomyEntryDict] + """The structured definition of the loss taxonomy for this cluster.""" + + item_count: Optional[int] + """The total number of EvaluationItems falling into this cluster.""" + + examples: Optional[list[LossExampleDict]] + """A list of examples that belong to this cluster. 
This links the cluster back to the specific EvaluationItems and Rubrics.""" + + +LossClusterOrDict = Union[LossCluster, LossClusterDict] + + +class LossAnalysisResult(_common.BaseModel): + """The top-level result for loss analysis.""" + + config: Optional[LossAnalysisConfig] = Field( + default=None, + description="""The configuration used to generate this analysis.""", + ) + analysis_time: Optional[str] = Field( + default=None, description="""The timestamp when this analysis was performed.""" + ) + clusters: Optional[list[LossCluster]] = Field( + default=None, description="""The list of identified loss clusters.""" + ) + + def show(self) -> None: + """Shows the loss analysis result with rich HTML visualization.""" + from .. import _evals_visualization + + _evals_visualization.display_loss_analysis_result(self) + + +class LossAnalysisResultDict(TypedDict, total=False): + """The top-level result for loss analysis.""" + + config: Optional[LossAnalysisConfigDict] + """The configuration used to generate this analysis.""" + + analysis_time: Optional[str] + """The timestamp when this analysis was performed.""" + + clusters: Optional[list[LossClusterDict]] + """The list of identified loss clusters.""" + + +LossAnalysisResultOrDict = Union[LossAnalysisResult, LossAnalysisResultDict] + + class EvaluationRunResults(_common.BaseModel): """Represents the results of an evaluation run.""" @@ -2551,6 +2767,10 @@ class EvaluationRunResults(_common.BaseModel): summary_metrics: Optional[SummaryMetric] = Field( default=None, description="""The summary metrics for the evaluation run.""" ) + loss_analysis_results: Optional[list[LossAnalysisResult]] = Field( + default=None, + description="""The loss analysis results for the evaluation run.""", + ) class EvaluationRunResultsDict(TypedDict, total=False): @@ -2562,6 +2782,9 @@ class EvaluationRunResultsDict(TypedDict, total=False): summary_metrics: Optional[SummaryMetricDict] """The summary metrics for the evaluation run.""" + 
loss_analysis_results: Optional[list[LossAnalysisResultDict]] + """The loss analysis results for the evaluation run.""" + EvaluationRunResultsOrDict = Union[EvaluationRunResults, EvaluationRunResultsDict] @@ -3145,6 +3368,18 @@ def show(self) -> None: logger.warning( "Evaluation Run succeeded but no evaluation item results found. To display results, please set include_evaluation_items to True when calling get_evaluation_run()." ) + # Show loss analysis results if present on the evaluation run. + # Pass the eval item map so the visualization can enrich + # loss examples with scenario/rubric data. + if ( + self.evaluation_run_results + and self.evaluation_run_results.loss_analysis_results + ): + eval_item_map = getattr(self, "_eval_item_map", None) + _evals_visualization.display_loss_analysis_results( + self.evaluation_run_results.loss_analysis_results, + eval_item_map=eval_item_map, + ) else: _evals_visualization.display_evaluation_run_status(self) @@ -4726,46 +4961,6 @@ class GenerateUserScenariosResponseDict(TypedDict, total=False): ] -class LossAnalysisConfig(_common.BaseModel): - """Configuration for the loss analysis job.""" - - metric: Optional[str] = Field( - default=None, - description="""Required. The metric to analyze (e.g., "multi_turn_tool_use_quality_v1").""", - ) - candidate: Optional[str] = Field( - default=None, - description="""Required. The candidate model/agent to analyze (e.g., "gemini-3.1-pro-preview"). This targets the specific CandidateResult within the EvaluationResult.""", - ) - predefined_taxonomy: Optional[str] = Field( - default=None, - description="""Optional. The identifier for the pre-defined taxonomy to use (e.g., "agent_taxonomy_v1", "tool_use_v2"). If not specified, the service may select a default based on the metric.""", - ) - max_top_cluster_count: Optional[int] = Field( - default=None, - description="""Optional. Limits the analysis to the top N clusters. 
If not specified or set to 0, all clusters are returned.""", - ) - - -class LossAnalysisConfigDict(TypedDict, total=False): - """Configuration for the loss analysis job.""" - - metric: Optional[str] - """Required. The metric to analyze (e.g., "multi_turn_tool_use_quality_v1").""" - - candidate: Optional[str] - """Required. The candidate model/agent to analyze (e.g., "gemini-3.1-pro-preview"). This targets the specific CandidateResult within the EvaluationResult.""" - - predefined_taxonomy: Optional[str] - """Optional. The identifier for the pre-defined taxonomy to use (e.g., "agent_taxonomy_v1", "tool_use_v2"). If not specified, the service may select a default based on the metric.""" - - max_top_cluster_count: Optional[int] - """Optional. Limits the analysis to the top N clusters. If not specified or set to 0, all clusters are returned.""" - - -LossAnalysisConfigOrDict = Union[LossAnalysisConfig, LossAnalysisConfigDict] - - class GenerateLossClustersConfig(_common.BaseModel): """Config for generating loss clusters.""" @@ -4834,175 +5029,6 @@ class _GenerateLossClustersParametersDict(TypedDict, total=False): ] -class LossTaxonomyEntry(_common.BaseModel): - """A specific entry in the loss pattern taxonomy.""" - - l1_category: Optional[str] = Field( - default=None, - description="""The primary category of the loss (e.g., "Hallucination", "Tool Calling").""", - ) - l2_category: Optional[str] = Field( - default=None, - description="""The secondary category of the loss (e.g., "Hallucination of Action", "Incorrect Tool Selection").""", - ) - description: Optional[str] = Field( - default=None, - description="""A detailed description of this loss pattern. Example: "The agent verbally confirms an action without executing the tool." 
""", - ) - - -class LossTaxonomyEntryDict(TypedDict, total=False): - """A specific entry in the loss pattern taxonomy.""" - - l1_category: Optional[str] - """The primary category of the loss (e.g., "Hallucination", "Tool Calling").""" - - l2_category: Optional[str] - """The secondary category of the loss (e.g., "Hallucination of Action", "Incorrect Tool Selection").""" - - description: Optional[str] - """A detailed description of this loss pattern. Example: "The agent verbally confirms an action without executing the tool." """ - - -LossTaxonomyEntryOrDict = Union[LossTaxonomyEntry, LossTaxonomyEntryDict] - - -class FailedRubric(_common.BaseModel): - """A specific failed rubric and the associated analysis.""" - - rubric_id: Optional[str] = Field( - default=None, - description="""The unique ID of the rubric (if available from the metric source).""", - ) - classification_rationale: Optional[str] = Field( - default=None, - description="""The rationale provided by the Loss Analysis Classifier for why this failure maps to this specific Loss Cluster.""", - ) - - -class FailedRubricDict(TypedDict, total=False): - """A specific failed rubric and the associated analysis.""" - - rubric_id: Optional[str] - """The unique ID of the rubric (if available from the metric source).""" - - classification_rationale: Optional[str] - """The rationale provided by the Loss Analysis Classifier for why this failure maps to this specific Loss Cluster.""" - - -FailedRubricOrDict = Union[FailedRubric, FailedRubricDict] - - -class LossExample(_common.BaseModel): - """A specific example of a loss pattern.""" - - evaluation_item: Optional[str] = Field( - default=None, - description="""Reference to the persisted EvalItem resource name. Format: projects/.../locations/.../evaluationItems/{item_id}.""", - ) - evaluation_result: Optional[dict[str, Any]] = Field( - default=None, - description="""The full evaluation result object provided inline. 
Used when the analysis is performed on ephemeral data.""", - ) - failed_rubrics: Optional[list[FailedRubric]] = Field( - default=None, - description="""The specific rubric(s) that failed and caused this example to be classified here. An example might fail multiple rubrics, but only specific ones trigger this loss pattern.""", - ) - - -class LossExampleDict(TypedDict, total=False): - """A specific example of a loss pattern.""" - - evaluation_item: Optional[str] - """Reference to the persisted EvalItem resource name. Format: projects/.../locations/.../evaluationItems/{item_id}.""" - - evaluation_result: Optional[dict[str, Any]] - """The full evaluation result object provided inline. Used when the analysis is performed on ephemeral data.""" - - failed_rubrics: Optional[list[FailedRubricDict]] - """The specific rubric(s) that failed and caused this example to be classified here. An example might fail multiple rubrics, but only specific ones trigger this loss pattern.""" - - -LossExampleOrDict = Union[LossExample, LossExampleDict] - - -class LossCluster(_common.BaseModel): - """A semantic grouping of failures (e.g., "Hallucination of Action").""" - - cluster_id: Optional[str] = Field( - default=None, - description="""Unique identifier for the loss cluster within the scope of the analysis result.""", - ) - taxonomy_entry: Optional[LossTaxonomyEntry] = Field( - default=None, - description="""The structured definition of the loss taxonomy for this cluster.""", - ) - item_count: Optional[int] = Field( - default=None, - description="""The total number of EvaluationItems falling into this cluster.""", - ) - examples: Optional[list[LossExample]] = Field( - default=None, - description="""A list of examples that belong to this cluster. 
This links the cluster back to the specific EvaluationItems and Rubrics.""", - ) - - -class LossClusterDict(TypedDict, total=False): - """A semantic grouping of failures (e.g., "Hallucination of Action").""" - - cluster_id: Optional[str] - """Unique identifier for the loss cluster within the scope of the analysis result.""" - - taxonomy_entry: Optional[LossTaxonomyEntryDict] - """The structured definition of the loss taxonomy for this cluster.""" - - item_count: Optional[int] - """The total number of EvaluationItems falling into this cluster.""" - - examples: Optional[list[LossExampleDict]] - """A list of examples that belong to this cluster. This links the cluster back to the specific EvaluationItems and Rubrics.""" - - -LossClusterOrDict = Union[LossCluster, LossClusterDict] - - -class LossAnalysisResult(_common.BaseModel): - """The top-level result for loss analysis.""" - - config: Optional[LossAnalysisConfig] = Field( - default=None, - description="""The configuration used to generate this analysis.""", - ) - analysis_time: Optional[str] = Field( - default=None, description="""The timestamp when this analysis was performed.""" - ) - clusters: Optional[list[LossCluster]] = Field( - default=None, description="""The list of identified loss clusters.""" - ) - - def show(self) -> None: - """Shows the loss analysis result with rich HTML visualization.""" - from .. 
import _evals_visualization - - _evals_visualization.display_loss_analysis_result(self) - - -class LossAnalysisResultDict(TypedDict, total=False): - """The top-level result for loss analysis.""" - - config: Optional[LossAnalysisConfigDict] - """The configuration used to generate this analysis.""" - - analysis_time: Optional[str] - """The timestamp when this analysis was performed.""" - - clusters: Optional[list[LossClusterDict]] - """The list of identified loss clusters.""" - - -LossAnalysisResultOrDict = Union[LossAnalysisResult, LossAnalysisResultDict] - - class GenerateLossClustersResponse(_common.BaseModel): """Response message for EvaluationAnalyticsService.GenerateLossClusters."""