diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py
index 598d79440e..355d5d9942 100644
--- a/tests/unit/vertexai/genai/test_evals.py
+++ b/tests/unit/vertexai/genai/test_evals.py
@@ -4691,6 +4691,34 @@ def test_convert_with_conversation_history(self):
             eval_case.conversation_history[1].content.parts[0].text == "Old model msg"
         )
 
+    def test_convert_with_conversation_history_column_name(self):
+        """Tests that 'conversation_history' is accepted as a column name alias for 'history'."""
+        raw_data_df = pd.DataFrame(
+            {
+                "prompt": ["Current prompt"],
+                "response": ["A response"],
+                "conversation_history": [
+                    [
+                        {"role": "user", "parts": [{"text": "Old user msg"}]},
+                        {"role": "model", "parts": [{"text": "Old model msg"}]},
+                    ]
+                ],
+            }
+        )
+        raw_data = raw_data_df.to_dict(orient="records")
+        result_dataset = self.converter.convert(raw_data)
+        eval_case = result_dataset.eval_cases[0]
+
+        assert eval_case.prompt == genai_types.Content(
+            parts=[genai_types.Part(text="Current prompt")]
+        )
+        assert eval_case.reference is None
+        assert len(eval_case.conversation_history) == 2
+        assert eval_case.conversation_history[0].content.parts[0].text == "Old user msg"
+        assert (
+            eval_case.conversation_history[1].content.parts[0].text == "Old model msg"
+        )
+
     def test_convert_missing_response_raises_value_error(self):
         raw_data_df = pd.DataFrame({"prompt": ["Hello"]})  # Missing response
         raw_data = raw_data_df.to_dict(orient="records")
@@ -8355,6 +8383,176 @@ def test_create_evaluation_set_with_agent_data(
         assert candidate_response["candidate"] == "test-candidate"
         assert candidate_response["agent_data"] == agent_data
 
+    @mock.patch.object(_evals_common, "evals")
+    @mock.patch.object(_evals_common, "_gcs_utils")
+    def test_create_evaluation_set_with_history_column(
+        self, mock_gcs_utils, mock_evals_module
+    ):
+        """Tests that 'history' column is accepted and mapped to prompt_template_data."""
+        eval_df = pd.DataFrame(
+            [
+                {
+                    "prompt": "test prompt",
+                    "response": "test response",
+                    "history": "previous conversation",
+                }
+            ]
+        )
+
+        mock_gcs_instance = mock_gcs_utils.GcsUtils.return_value
+        mock_gcs_instance.upload_json_to_prefix.return_value = (
+            "gs://bucket/path/request.json"
+        )
+
+        mock_evals_instance = mock_evals_module.Evals.return_value
+        mock_eval_item = mock.Mock()
+        mock_eval_item.name = "eval_item_1"
+        mock_evals_instance.create_evaluation_item.return_value = mock_eval_item
+
+        mock_eval_set = mock.Mock()
+        mock_evals_instance.create_evaluation_set.return_value = mock_eval_set
+
+        result = _evals_common._create_evaluation_set_from_dataframe(
+            api_client=self.mock_api_client,
+            gcs_dest_prefix="gs://bucket/prefix",
+            eval_df=eval_df,
+            candidate_name="test-candidate",
+        )
+
+        assert result == mock_eval_set
+
+        mock_gcs_instance.upload_json_to_prefix.assert_called_once()
+        call_args = mock_gcs_instance.upload_json_to_prefix.call_args
+        uploaded_data = call_args.kwargs["data"]
+
+        assert "prompt_template_data" in uploaded_data["prompt"]
+        ptd_values = uploaded_data["prompt"]["prompt_template_data"]["values"]
+        assert "conversation_history" in ptd_values
+
+    @mock.patch.object(_evals_common, "evals")
+    @mock.patch.object(_evals_common, "_gcs_utils")
+    def test_create_evaluation_set_with_conversation_history_column(
+        self, mock_gcs_utils, mock_evals_module
+    ):
+        """Tests that 'conversation_history' column is accepted and mapped to prompt_template_data."""
+        eval_df = pd.DataFrame(
+            [
+                {
+                    "prompt": "test prompt",
+                    "response": "test response",
+                    "conversation_history": "previous conversation",
+                }
+            ]
+        )
+
+        mock_gcs_instance = mock_gcs_utils.GcsUtils.return_value
+        mock_gcs_instance.upload_json_to_prefix.return_value = (
+            "gs://bucket/path/request.json"
+        )
+
+        mock_evals_instance = mock_evals_module.Evals.return_value
+        mock_eval_item = mock.Mock()
+        mock_eval_item.name = "eval_item_1"
+        mock_evals_instance.create_evaluation_item.return_value = mock_eval_item
+
+        mock_eval_set = mock.Mock()
+        mock_evals_instance.create_evaluation_set.return_value = mock_eval_set
+
+        result = _evals_common._create_evaluation_set_from_dataframe(
+            api_client=self.mock_api_client,
+            gcs_dest_prefix="gs://bucket/prefix",
+            eval_df=eval_df,
+            candidate_name="test-candidate",
+        )
+
+        assert result == mock_eval_set
+
+        mock_gcs_instance.upload_json_to_prefix.assert_called_once()
+        call_args = mock_gcs_instance.upload_json_to_prefix.call_args
+        uploaded_data = call_args.kwargs["data"]
+
+        assert "prompt_template_data" in uploaded_data["prompt"]
+        ptd_values = uploaded_data["prompt"]["prompt_template_data"]["values"]
+        assert "conversation_history" in ptd_values
+
+
+class TestResolveDataset:
+    """Unit tests for the _resolve_dataset function."""
+
+    def setup_method(self):
+        self.mock_api_client = mock.Mock(spec=client.Client)
+        self.mock_api_client.project = "test-project"
+        self.mock_api_client.location = "us-central1"
+
+    @mock.patch.object(_evals_common, "evals")
+    @mock.patch.object(_evals_common, "_gcs_utils")
+    def test_resolve_dataset_preserves_conversation_history(
+        self, mock_gcs_utils, mock_evals_module
+    ):
+        """Tests that conversation_history from EvalCase is included in the DataFrame."""
+        mock_gcs_instance = mock_gcs_utils.GcsUtils.return_value
+        mock_gcs_instance.upload_json_to_prefix.return_value = (
+            "gs://bucket/path/request.json"
+        )
+
+        mock_evals_instance = mock_evals_module.Evals.return_value
+        mock_eval_item = mock.Mock()
+        mock_eval_item.name = "eval_item_1"
+        mock_evals_instance.create_evaluation_item.return_value = mock_eval_item
+
+        mock_eval_set = mock.Mock()
+        mock_eval_set.name = "eval_set_1"
+        mock_evals_instance.create_evaluation_set.return_value = mock_eval_set
+
+        history_content_1 = genai_types.Content(
+            role="user", parts=[genai_types.Part(text="Old user msg")]
+        )
+        history_content_2 = genai_types.Content(
+            role="model", parts=[genai_types.Part(text="Old model msg")]
+        )
+
+        dataset = vertexai_genai_types.EvaluationDataset(
+            eval_cases=[
+                vertexai_genai_types.EvalCase(
+                    prompt=genai_types.Content(
+                        parts=[genai_types.Part(text="test prompt")]
+                    ),
+                    responses=[
+                        vertexai_genai_types.ResponseCandidate(
+                            response=genai_types.Content(
+                                parts=[genai_types.Part(text="test response")]
+                            )
+                        )
+                    ],
+                    conversation_history=[
+                        vertexai_genai_types.evals.Message(
+                            turn_id="0", content=history_content_1
+                        ),
+                        vertexai_genai_types.evals.Message(
+                            turn_id="1", content=history_content_2
+                        ),
+                    ],
+                )
+            ]
+        )
+
+        result = _evals_common._resolve_dataset(
+            api_client=self.mock_api_client,
+            dataset=dataset,
+            dest="gs://bucket/prefix",
+        )
+
+        assert result.evaluation_set == "eval_set_1"
+
+        # Verify that conversation_history was passed through to the GCS upload
+        mock_gcs_instance.upload_json_to_prefix.assert_called_once()
+        call_args = mock_gcs_instance.upload_json_to_prefix.call_args
+        uploaded_data = call_args.kwargs["data"]
+
+        assert "prompt_template_data" in uploaded_data["prompt"]
+        ptd_values = uploaded_data["prompt"]["prompt_template_data"]["values"]
+        assert "conversation_history" in ptd_values
+
 
 class TestRateLimiter:
     """Tests for the RateLimiter class in _evals_utils."""
diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py
index 1c2bfef81a..4e00aa6d09 100644
--- a/vertexai/_genai/_evals_common.py
+++ b/vertexai/_genai/_evals_common.py
@@ -326,6 +326,20 @@ def _resolve_dataset(
                             if event.content
                         ]
 
+                    if case.conversation_history:
+                        history_parts = []
+                        for msg in case.conversation_history:
+                            if msg.content:
+                                role = msg.content.role or "user"
+                                text = _evals_data_converters._get_content_text(
+                                    msg.content
+                                )
+                                history_parts.append(f"{role}: {text}")
+                        if history_parts:
+                            row[_evals_constant.CONVERSATION_HISTORY] = "\n".join(
+                                history_parts
+                            )
+
                     if case.user_scenario:
                         if case.user_scenario.starting_prompt:
                             row[_evals_constant.STARTING_PROMPT] = (
@@ -2586,6 +2600,14 @@ def _create_evaluation_set_from_dataframe(
             )
 
         prompt = None
+        # Determine which history column name is present, preferring
+        # "conversation_history" over "history" if both exist.
+        history_col = None
+        if _evals_constant.CONVERSATION_HISTORY in row:
+            history_col = _evals_constant.CONVERSATION_HISTORY
+        elif _evals_constant.HISTORY in row:
+            history_col = _evals_constant.HISTORY
+
         if (
             _evals_constant.STARTING_PROMPT in row
             and _evals_constant.CONVERSATION_PLAN in row
@@ -2596,15 +2618,15 @@ def _create_evaluation_set_from_dataframe(
                     conversation_plan=row[_evals_constant.CONVERSATION_PLAN],
                 )
             )
-        elif _evals_constant.CONTEXT in row or _evals_constant.HISTORY in row:
+        elif _evals_constant.CONTEXT in row or history_col:
             values = {}
             if _evals_constant.CONTEXT in row:
                 values[_evals_constant.CONTEXT] = _get_content(
                     row, _evals_constant.CONTEXT
                 )
-            if _evals_constant.HISTORY in row:
-                values[_evals_constant.HISTORY] = _get_content(
-                    row, _evals_constant.HISTORY
+            if history_col:
+                values[_evals_constant.CONVERSATION_HISTORY] = _get_content(
+                    row, history_col
                 )
             if _evals_constant.PROMPT in row:
                 values[_evals_constant.PROMPT] = _get_content(
diff --git a/vertexai/_genai/_evals_constant.py b/vertexai/_genai/_evals_constant.py
index f825820f09..822f8e685a 100644
--- a/vertexai/_genai/_evals_constant.py
+++ b/vertexai/_genai/_evals_constant.py
@@ -59,7 +59,8 @@
 AGENT_DATA = "agent_data"
 STARTING_PROMPT = "starting_prompt"
 CONVERSATION_PLAN = "conversation_plan"
-HISTORY = "conversation_history"
+HISTORY = "history"
+CONVERSATION_HISTORY = "conversation_history"
 
 COMMON_DATASET_COLUMNS = frozenset(
     {
@@ -69,6 +70,7 @@
         SESSION_INPUT,
         CONTEXT,
         HISTORY,
+        CONVERSATION_HISTORY,
         STARTING_PROMPT,
         CONVERSATION_PLAN,
         AGENT_DATA,
diff --git a/vertexai/_genai/_evals_data_converters.py b/vertexai/_genai/_evals_data_converters.py
index c194c3a9d5..16e7779ad9 100644
--- a/vertexai/_genai/_evals_data_converters.py
+++ b/vertexai/_genai/_evals_data_converters.py
@@ -196,7 +196,13 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
             if not prompt_data:
                 prompt_data = item.pop("source", None)
 
-            conversation_history_data = item.pop("history", None)
+            # "conversation_history" takes precedence over the legacy "history"
+            # alias. Pop both unconditionally so the alias that loses is still
+            # consumed instead of staying behind in `item` as a leftover key.
+            conversation_history_data = item.pop("conversation_history", None)
+            legacy_history_data = item.pop("history", None)
+            if conversation_history_data is None:
+                conversation_history_data = legacy_history_data
             response_data = item.pop("response", None)
             reference_data = item.pop("reference", None)
             system_instruction_data = item.pop("instruction", None)