Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
561 changes: 561 additions & 0 deletions tests/unit/vertexai/genai/test_evals.py

Large diffs are not rendered by default.

54 changes: 46 additions & 8 deletions vertexai/_genai/_evals_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2396,14 +2396,48 @@ def _get_eval_result_from_eval_items(
return eval_result


def _build_eval_item_map(
    eval_items: list[types.EvaluationItem],
) -> dict[str, dict[str, Any]]:
    """Builds a mapping from EvaluationItem resource name to serialized data.

    The loss analysis visualization uses this map to enrich loss examples
    with scenario and rubric data taken from the original evaluation items.

    Args:
        eval_items: The list of EvaluationItem objects.

    Returns:
        A dict keyed by evaluation item resource name, whose values are the
        serialized ``evaluation_response`` dicts (read by the JS
        visualization as ``evaluation_result``).
    """
    mapping: dict[str, dict[str, Any]] = {}
    for eval_item in eval_items:
        # Items without a resource name or a response cannot be joined later.
        if not (eval_item.name and eval_item.evaluation_response):
            continue
        try:
            serialized = eval_item.evaluation_response.model_dump(
                mode="json", exclude_none=True
            )
        except Exception:
            # Best effort: an item that fails to serialize is omitted from
            # the map instead of aborting the whole visualization.
            continue
        mapping[eval_item.name] = serialized
    return mapping


def _convert_evaluation_run_results(
api_client: BaseApiClient,
evaluation_run_results: types.EvaluationRunResults,
inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
) -> Optional[types.EvaluationResult]:
"""Retrieves an EvaluationItem from the EvaluationRunResults."""
) -> tuple[Optional[types.EvaluationResult], dict[str, dict[str, Any]]]:
"""Retrieves an EvaluationResult and item map from EvaluationRunResults.

Returns:
A tuple of (EvaluationResult, eval_item_map). The eval_item_map maps
evaluation item resource names to their serialized evaluation response
data, used for enriching loss analysis visualization.
"""
if not evaluation_run_results or not evaluation_run_results.evaluation_set:
return None
return None, {}

evals_module = evals.Evals(api_client_=api_client)
eval_set = evals_module.get_evaluation_set(
Expand All @@ -2416,19 +2450,21 @@ def _convert_evaluation_run_results(
evals_module.get_evaluation_item(name=item_name)
for item_name in eval_set.evaluation_items
]
return _get_eval_result_from_eval_items(
eval_result = _get_eval_result_from_eval_items(
evaluation_run_results, eval_items, inference_configs
)
eval_item_map = _build_eval_item_map(eval_items)
return eval_result, eval_item_map


async def _convert_evaluation_run_results_async(
api_client: BaseApiClient,
evaluation_run_results: types.EvaluationRunResults,
inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
) -> Optional[types.EvaluationResult]:
"""Retrieves an EvaluationItem from the EvaluationRunResults."""
) -> tuple[Optional[types.EvaluationResult], dict[str, dict[str, Any]]]:
"""Retrieves an EvaluationResult and item map from EvaluationRunResults."""
if not evaluation_run_results or not evaluation_run_results.evaluation_set:
return None
return None, {}

evals_module = evals.AsyncEvals(api_client_=api_client)
eval_set = await evals_module.get_evaluation_set(
Expand All @@ -2442,9 +2478,11 @@ async def _convert_evaluation_run_results_async(
for eval_item in eval_set.evaluation_items
]
eval_items = await asyncio.gather(*tasks)
return _get_eval_result_from_eval_items(
eval_result = _get_eval_result_from_eval_items(
evaluation_run_results, eval_items, inference_configs
)
eval_item_map = _build_eval_item_map(eval_items)
return eval_result, eval_item_map


def _object_to_dict(obj: Any) -> Union[dict[str, Any], Any]:
Expand Down
58 changes: 58 additions & 0 deletions vertexai/_genai/_evals_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,64 @@ def _resolve_metric_name(
return str(metric)


def _resolve_eval_run_loss_configs(
    loss_analysis_metrics: Optional[list[Any]] = None,
    loss_analysis_configs: Optional[list[Any]] = None,
    inference_configs: Optional[dict[str, Any]] = None,
) -> Optional[list[types.LossAnalysisConfig]]:
    """Resolves loss analysis configs for create_evaluation_run.

    Supports two modes:
    1. ``loss_analysis_metrics``: A simplified list of metrics. The candidate
       is auto-inferred from ``inference_configs`` when there is exactly one
       candidate. Each metric is resolved via ``_resolve_metric_name()``.
    2. ``loss_analysis_configs``: Explicit ``LossAnalysisConfig`` objects or
       dicts for full control. Takes precedence over the metrics path.

    Args:
        loss_analysis_metrics: Optional list of metric references (strings,
            Metric objects, or RubricMetric enums).
        loss_analysis_configs: Optional list of LossAnalysisConfig or dicts.
        inference_configs: The resolved inference_configs dict (candidate name
            -> config). Used to auto-infer candidate for the metrics path.

    Returns:
        A list of resolved LossAnalysisConfig objects, or None if neither
        loss_analysis_metrics nor loss_analysis_configs is provided.

    Raises:
        ValueError: If candidate cannot be inferred for loss_analysis_metrics.
    """
    if not loss_analysis_metrics and not loss_analysis_configs:
        return None

    # Explicit configs win: validate dicts, pass objects through untouched.
    if loss_analysis_configs:
        resolved = []
        for cfg in loss_analysis_configs:
            if isinstance(cfg, dict):
                resolved.append(types.LossAnalysisConfig.model_validate(cfg))
            else:
                resolved.append(cfg)
        return resolved

    # Metrics path: the candidate is only inferable when inference_configs
    # names exactly one candidate.
    candidate = None
    candidate_names = list(inference_configs) if inference_configs else []
    if len(candidate_names) == 1:
        candidate = candidate_names[0]
    elif len(candidate_names) > 1:
        raise ValueError(
            "Cannot infer candidate for loss analysis: multiple candidates"
            f" found in inference_configs: {candidate_names}."
            " Please use loss_analysis_configs with explicit candidate values"
            " instead."
        )

    return [
        types.LossAnalysisConfig(
            metric=_resolve_metric_name(metric), candidate=candidate
        )
        for metric in (loss_analysis_metrics or [])
    ]


def _resolve_loss_analysis_config(
eval_result: types.EvaluationResult,
config: Optional[types.LossAnalysisConfig] = None,
Expand Down
137 changes: 127 additions & 10 deletions vertexai/_genai/_evals_visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -1707,11 +1707,13 @@ def _get_loss_analysis_html(loss_analysis_json: str) -> str:
// Handles both snake_case (SDK-side) and camelCase (API echo-back) keys.
const er = ex.evaluation_result;
if (!er) return null;
const prompt = er.request && er.request.prompt;
if (!prompt) return null;
// Try agent_data path (snake_case or camelCase)
const agentData = prompt.agent_data || prompt.agentData;
if (agentData && agentData.turns) {{
const req = er.request;
if (!req) return null;
const prompt = req.prompt;

// Helper: extract first user text from agent_data turns
function firstUserText(agentData) {{
if (!agentData || !agentData.turns) return null;
for (const turn of agentData.turns) {{
if (!turn.events) continue;
for (const event of turn.events) {{
Expand All @@ -1726,16 +1728,47 @@ def _get_loss_analysis_html(loss_analysis_json: str) -> str:
}}
}}
}}
return null;
}}
// Try simple prompt path: request.prompt.parts[].text
if (prompt.parts) {{
for (const part of prompt.parts) {{
if (part.text) {{
const text = part.text.trim();

if (prompt) {{
// Path 1: prompt.agent_data.turns (LRO inline results path)
const agentData = prompt.agent_data || prompt.agentData;
const fromPromptAgent = firstUserText(agentData);
if (fromPromptAgent) return fromPromptAgent;

// Path 2: prompt.user_scenario.starting_prompt (eval run path)
const scenario = prompt.user_scenario || prompt.userScenario;
if (scenario) {{
const sp = scenario.starting_prompt || scenario.startingPrompt;
if (sp) {{
const text = sp.trim();
return text.length > 150 ? text.substring(0, 150) + '...' : text;
}}
}}

// Path 3: prompt.parts[].text (simple prompt path)
if (prompt.parts) {{
for (const part of prompt.parts) {{
if (part.text) {{
const text = part.text.trim();
return text.length > 150 ? text.substring(0, 150) + '...' : text;
}}
}}
}}
}}

// Path 4: candidate_responses[].agent_data.turns (eval run path -
// agent_data is on the candidate response, not the prompt)
const crs = req.candidate_responses || req.candidateResponses;
if (crs) {{
for (const cr of crs) {{
const ad = cr.agent_data || cr.agentData;
const fromCr = firstUserText(ad);
if (fromCr) return fromCr;
}}
}}

return null;
}}
}})();
Expand Down Expand Up @@ -1820,6 +1853,90 @@ def _get_status_html(status: str, error_message: Optional[str] = None) -> str:
)


def _enrich_loss_examples_with_eval_items(
results: list["types.LossAnalysisResult"],
eval_item_map: Optional[dict[str, dict[str, Any]]],
) -> list[dict[str, Any]]:
"""Enriches loss analysis examples with eval item data for visualization.

For the eval run path, loss examples only have ``evaluation_item``
(a resource name) but no ``evaluation_result``. The JS visualization
needs ``evaluation_result`` to extract scenario previews and rubric
descriptions. This function joins the loss examples with the eval
item map so the visualization works identically to the LRO path.

Args:
results: Loss analysis results from the eval run.
eval_item_map: Optional mapping from evaluation item resource name
to serialized evaluation response data (built by
``_evals_common._build_eval_item_map``).

Returns:
A list of dicts ready for JSON serialization, with ``evaluation_result``
populated on each example where a match is found.
"""
result_dicts = []
for r in results:
r_dump = r.model_dump(mode="json", exclude_none=True)
if eval_item_map:
clusters = r_dump.get("clusters", [])
for cluster in clusters:
examples = cluster.get("examples", [])
for ex in examples:
# Skip if evaluation_result is already populated (LRO path)
if ex.get("evaluation_result"):
continue
# Match by evaluation_item resource name
eval_item_ref = ex.get("evaluation_item")
if eval_item_ref and eval_item_ref in eval_item_map:
ex["evaluation_result"] = eval_item_map[eval_item_ref]
result_dicts.append(r_dump)
return result_dicts


def display_loss_analysis_results(
    results: list["types.LossAnalysisResult"],
    eval_item_map: Optional[dict[str, dict[str, Any]]] = None,
) -> None:
    """Displays loss analysis results from an EvaluationRun.

    Wraps the list of LossAnalysisResult objects in the same JSON structure
    used by GenerateLossClustersResponse and renders it through the shared
    ``_get_loss_analysis_html()`` function.

    When ``eval_item_map`` is provided (from
    ``get_evaluation_run(include_evaluation_items=True)``), the examples are
    enriched with scenario and rubric data for the visualization.

    Args:
        results: A list of LossAnalysisResult objects from
            EvaluationRunResults.loss_analysis_results.
        eval_item_map: Optional mapping from evaluation item resource name
            to serialized evaluation response data for enrichment.
    """
    # Guard clause: rendering only makes sense inside IPython.
    if not _is_ipython_env():
        logger.warning("Skipping display: not in an IPython environment.")
        return

    from IPython import display

    try:
        enriched = _enrich_loss_examples_with_eval_items(results, eval_item_map)
        payload = {"results": enriched}
    except Exception as e:
        # Surface serialization failures loudly; a silent blank widget would
        # be much harder to debug.
        logger.error(
            "Failed to serialize loss analysis results: %s",
            e,
            exc_info=True,
        )
        raise

    html_content = _get_loss_analysis_html(
        json.dumps(payload, ensure_ascii=False, default=_pydantic_serializer)
    )
    display.display(display.HTML(html_content))


def display_evaluation_run_status(eval_run_obj: "types.EvaluationRun") -> None:
"""Displays the status of an evaluation run in an IPython environment."""
if not _is_ipython_env():
Expand Down
Loading
Loading