From f6bfa9c55a791cb4bc8f7030d433aff7eba29fb1 Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Thu, 28 May 2026 23:57:28 -0700 Subject: [PATCH 1/5] fix(eval): use function calling for tool/input mocking so non-OpenAI models work Tool simulation and input generation in Studio Debug and Evaluation Set runs failed with AGENT_RUNTIME.UNEXPECTED_ERROR for non-OpenAI models (Anthropic Claude via Bedrock, Gemini). The mockers requested structured output via OpenAI-only `response_format` json_schema and parsed `choices[0].message.content`; for Claude that content is empty/None, so `json.loads(...)` raised. Switch both mockers to provider-agnostic function calling (mirrors llm_as_judge_evaluator): build a forced tool that wraps the output/input schema under a `response` property, force it via tool_choice, and read `tool_calls[0].arguments["response"]` (already a parsed dict). Hoist nested `$defs` to the tool-parameters root so `$ref`s from nested Pydantic models still resolve. The normalized LLM gateway now accepts raw-dict tools so arbitrary nested schemas survive (the ToolDefinition converter only emits flat properties). Regression introduced by #1555, which started routing the agent's model into simulations; before that, simulation always used a fixed OpenAI model, so non-OpenAI providers were never exercised on this path. Fixes AE-1646. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../platform/chat/_llm_gateway_service.py | 11 +- .../services/test_uipath_llm_integration.py | 82 ++++++ .../src/uipath/eval/mocks/_input_mocker.py | 29 +- .../src/uipath/eval/mocks/_llm_mocker.py | 29 +- .../uipath/eval/mocks/_structured_output.py | 70 +++++ .../tests/cli/eval/mocks/test_input_mocker.py | 26 +- .../cli/eval/mocks/test_input_mocker_span.py | 30 ++- .../uipath/tests/cli/eval/mocks/test_mocks.py | 248 ++++++++++++++---- .../cli/eval/mocks/test_structured_output.py | 73 ++++++ 9 files changed, 503 insertions(+), 95 deletions(-) create mode 100644 packages/uipath/src/uipath/eval/mocks/_structured_output.py create mode 100644 packages/uipath/tests/cli/eval/mocks/test_structured_output.py diff --git a/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py b/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py index ffe0bff99..bc89fd82a 100644 --- a/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py +++ b/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py @@ -401,7 +401,7 @@ async def chat_completions( presence_penalty: float = 0, top_p: float | None = 1, top_k: int | None = None, - tools: list[ToolDefinition] | None = None, + tools: list[ToolDefinition | dict[str, Any]] | None = None, tool_choice: ToolChoice | None = None, response_format: dict[str, Any] | type[BaseModel] | None = None, api_version: str = NORMALIZED_API_VERSION, @@ -583,10 +583,15 @@ class Country(BaseModel): # Use provided dictionary format directly request_body["response_format"] = response_format - # Add tools if provided - convert to UiPath format + # Add tools if provided. A tool already in UiPath wire format (a dict) is + # passed through unchanged so callers can supply an arbitrary JSON schema + # for the parameters; ToolDefinition objects are converted as before. if tools: request_body["tools"] = [ - self._convert_tool_to_uipath_format(tool) for tool in tools + tool + if isinstance(tool, dict) + else self._convert_tool_to_uipath_format(tool) + for tool in tools ] # Handle tool_choice diff --git a/packages/uipath-platform/tests/services/test_uipath_llm_integration.py b/packages/uipath-platform/tests/services/test_uipath_llm_integration.py index 124ccad8b..9e2292c60 100644 --- a/packages/uipath-platform/tests/services/test_uipath_llm_integration.py +++ b/packages/uipath-platform/tests/services/test_uipath_llm_integration.py @@ -7,6 +7,7 @@ from uipath.platform.chat import ( AutoToolChoice, ChatModels, + RequiredToolChoice, SpecificToolChoice, ToolDefinition, ToolFunctionDefinition, @@ -369,6 +370,87 @@ async def test_tool_call_required_mocked(self, mock_request, llm_service): assert result.choices[0].message.tool_calls[0].arguments["name"] == "John" assert result.choices[0].message.tool_calls[0].arguments["password"] == "1234" + @pytest.mark.asyncio + @patch.object(UiPathLlmChatService, "request_async") + async def test_raw_dict_tool_passthrough_mocked(self, mock_request, llm_service): + """A tool supplied as a raw dict is sent unchanged, preserving nested schema. + + ToolDefinition's converter only emits flat properties, so callers that need + an arbitrary nested JSON schema (e.g. the eval mockers) pass the tool as a + dict already in UiPath wire format. It must reach the gateway verbatim. + """ + mock_response = MagicMock() + mock_response.json.return_value = { + "id": "chatcmpl-raw", + "object": "chat.completion", + "created": 1677858242, + "model": "gpt-4o-mini-2024-07-18", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_raw", + "name": "submit_tool_response", + "arguments": {"response": {"items": [{"sku": "A1"}]}}, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "usage": { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + "cache_read_input_tokens": None, + }, + } + mock_request.return_value = mock_response + + nested_tool = { + "name": "submit_tool_response", + "description": "Return the simulated response matching the schema.", + "parameters": { + "type": "object", + "properties": { + "response": { + "type": "object", + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "properties": {"sku": {"type": "string"}}, + }, + } + }, + } + }, + "required": ["response"], + }, + } + + result = await llm_service.chat_completions( + messages=[{"role": "user", "content": "go"}], + model=ChatModels.gpt_4_1_mini_2025_04_14, + tools=[nested_tool], + tool_choice=RequiredToolChoice(), + ) + + mock_request.assert_called_once() + _, kwargs = mock_request.call_args + body = kwargs["json"] + # The dict tool is forwarded byte-for-byte, nested array schema intact. + assert body["tools"] == [nested_tool] + assert body["tool_choice"] == {"type": "required"} + assert result.choices[0].message.tool_calls[0].arguments == { + "response": {"items": [{"sku": "A1"}]} + } + @pytest.mark.asyncio @patch.object(UiPathLlmChatService, "request_async") async def test_chat_with_conversation_history_mocked( diff --git a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py index 57a727ec1..3c4daac51 100644 --- a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py +++ b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py @@ -11,10 +11,12 @@ from uipath.platform import UiPath from uipath.platform.chat import UiPathLlmChatService from uipath.platform.chat._llm_gateway_service import ChatModels +from uipath.platform.chat.llm_gateway import RequiredToolChoice from .._execution_context import eval_set_run_id_context from ._mock_context import cache_manager_context from ._mocker import UiPathInputMockingError +from ._structured_output import build_response_tool, extract_response from ._types import ( InputMockingStrategy, ) @@ -105,14 +107,13 @@ async def generate_llm_input( prompt = get_input_mocking_prompt(**prompt_generation_args) - response_format = { - "type": "json_schema", - "json_schema": { - "name": "agent_input", - "strict": False, - "schema": input_schema, - }, - } + # Request structured output via function calling so it works across all + # model providers (OpenAI, Claude/Bedrock, Gemini); response_format is only + # honored for OpenAI models on the normalized gateway. + response_tool = build_response_tool( + input_schema, + description="Return the simulated agent input matching the required schema.", + ) model_parameters = mocking_strategy.model if mocking_strategy else None completion_kwargs = ( @@ -128,7 +129,7 @@ async def generate_llm_input( if cache_manager is not None: cache_key_data = { - "response_format": response_format, + "response_tool": response_tool, "completion_kwargs": completion_kwargs, "prompt_generation_args": prompt_generation_args, } @@ -144,12 +145,12 @@ async def generate_llm_input( response = await llm.chat_completions( [{"role": "user", "content": prompt}], - response_format=response_format, + tools=[response_tool], + tool_choice=RequiredToolChoice(), **completion_kwargs, ) - generated_input_str = response.choices[0].message.content - result = json.loads(generated_input_str) + result = extract_response(response) if cache_manager is not None: cache_manager.set( @@ -160,10 +161,6 @@ async def generate_llm_input( ) return result - except json.JSONDecodeError as e: - raise UiPathInputMockingError( - f"Failed to parse LLM response as JSON: {str(e)}" - ) from e except UiPathInputMockingError: raise except Exception as e: diff --git a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py index d1fd2a1c9..ce932da11 100644 --- a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py +++ b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py @@ -11,6 +11,7 @@ from uipath.platform import UiPath from uipath.platform.chat import UiPathLlmChatService from uipath.platform.chat._llm_gateway_service import ChatModels, _cleanup_schema +from uipath.platform.chat.llm_gateway import RequiredToolChoice from .._execution_context import ( eval_set_run_id_context, @@ -28,6 +29,7 @@ UiPathMockResponseGenerationError, UiPathNoMockFoundError, ) +from ._structured_output import build_response_tool, extract_response from ._types import ( ExampleCall, LLMMockingStrategy, @@ -125,14 +127,16 @@ async def response( "output_schema", TypeAdapter(return_type).json_schema() ) - response_format = { - "type": "json_schema", - "json_schema": { - "name": "OutputSchema", - "strict": False, - "schema": _cleanup_schema(output_schema), - }, - } + # Request structured output via function calling so it works across + # all model providers (OpenAI, Claude/Bedrock, Gemini); response_format + # is only honored for OpenAI models on the normalized gateway. + response_tool = build_response_tool( + _cleanup_schema(output_schema), + description=( + "Return the simulated response for tool " + f"'{function_name}' matching the required schema." + ), + ) try: # Safely pull examples from params. example_calls = params.get("example_calls", []) @@ -197,7 +201,7 @@ async def response( formatted_prompt = PROMPT.format(**prompt_generation_args) cache_key_data = { - "response_format": response_format, + "response_tool": response_tool, "completion_kwargs": completion_kwargs, "prompt_generation_args": prompt_generation_args, } @@ -220,10 +224,11 @@ async def response( "content": formatted_prompt, }, ], - response_format=response_format, + tools=[response_tool], + tool_choice=RequiredToolChoice(), **completion_kwargs, ) - result = json.loads(response.choices[0].message.content) + result = extract_response(response) if cache_manager is not None: cache_manager.set( @@ -235,7 +240,7 @@ async def response( return result except Exception as e: - raise UiPathMockResponseGenerationError() from e + raise UiPathMockResponseGenerationError(str(e)) from e else: raise UiPathNoMockFoundError(f"Method '{function_name}' is not simulated.") diff --git a/packages/uipath/src/uipath/eval/mocks/_structured_output.py b/packages/uipath/src/uipath/eval/mocks/_structured_output.py new file mode 100644 index 000000000..424935190 --- /dev/null +++ b/packages/uipath/src/uipath/eval/mocks/_structured_output.py @@ -0,0 +1,70 @@ +"""Provider-agnostic structured output via LLM function calling. + +The normalized LLM Gateway honors OpenAI-style ``response_format`` (json_schema) +only for OpenAI models. Non-OpenAI providers (Anthropic/Claude via Bedrock, +Gemini) return such requests with ``choices[0].message.content`` empty/None, +which breaks JSON parsing. Function calling is honored across all providers, so +the mockers request structured output as a forced tool call and read the result +from the tool call's parsed arguments. +""" + +from typing import Any + +RESPONSE_TOOL_NAME = "submit_tool_response" +RESPONSE_KEY = "response" + + +def build_response_tool(schema: dict[str, Any], description: str) -> dict[str, Any]: + """Build a normalized-API function tool that wraps ``schema`` under ``response``. + + Tool-call arguments are always a JSON object, so an arbitrary output schema + (which may be a scalar, array, or object) is nested under a single + ``response`` property and unwrapped after the call. + + Schemas from nested Pydantic models carry root ``$defs`` referenced by + ``$ref`` values like ``#/$defs/Item``. Those ``$ref`` paths resolve from the + parameters root, so ``$defs`` is hoisted there instead of being buried under + ``response`` (which would leave the references dangling). + """ + response_schema = dict(schema) + parameters: dict[str, Any] = { + "type": "object", + "properties": {RESPONSE_KEY: response_schema}, + "required": [RESPONSE_KEY], + } + defs = response_schema.pop("$defs", None) + if defs is not None: + parameters["$defs"] = defs + + return { + "name": RESPONSE_TOOL_NAME, + "description": description, + "parameters": parameters, + } + + +def extract_response(response: Any) -> Any: + """Extract the wrapped value from the forced tool call. + + Raises: + ValueError: if the response carries no usable tool call or is missing the + wrapped ``response`` key. + """ + choices = getattr(response, "choices", None) + if not choices: + raise ValueError("LLM response contained no choices") + + message = choices[0].message + tool_calls = getattr(message, "tool_calls", None) + if not tool_calls: + raise ValueError( + f"LLM response contained no tool calls (content={message.content!r})" + ) + + arguments = tool_calls[0].arguments + if RESPONSE_KEY not in arguments: + raise ValueError( + f"Tool call arguments missing '{RESPONSE_KEY}' key: {arguments}" + ) + + return arguments[RESPONSE_KEY] diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py index 72b3765df..181a14b6a 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py +++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py @@ -83,10 +83,18 @@ async def test_generate_llm_input_with_model_settings( "index": 0, "message": { "role": "assistant", - "content": '{"query": "Calculate 5 times 7"}', - "tool_calls": None, + "content": None, + "tool_calls": [ + { + "id": "call_1", + "name": "submit_tool_response", + "arguments": { + "response": {"query": "Calculate 5 times 7"} + }, + } + ], }, - "finish_reason": "stop", + "finish_reason": "tool_calls", } ], "usage": { @@ -112,3 +120,15 @@ async def test_generate_llm_input_with_model_settings( assert len(chat_completion_requests) == 1, ( "Expected exactly one chat completion request" ) + + # Structured output is requested via function calling (provider-agnostic), + # not via response_format which the gateway only honors for OpenAI models. + import json + + body = json.loads(chat_completion_requests[0].content.decode("utf-8")) + assert "response_format" not in body + assert body["tool_choice"] == {"type": "required"} + tools = body["tools"] + assert len(tools) == 1 + assert tools[0]["name"] == "submit_tool_response" + assert tools[0]["parameters"]["properties"]["response"] == input_schema diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py index 19a432fef..aeffa2d1e 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py +++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py @@ -57,10 +57,21 @@ async def test_simulate_input_span_attributes(httpx_mock: HTTPXMock, monkeypatch "index": 0, "message": { "role": "assistant", - "content": '{"name": "Alice", "greeting_style": "formal"}', - "tool_calls": None, + "content": None, + "tool_calls": [ + { + "id": "call_1", + "name": "submit_tool_response", + "arguments": { + "response": { + "name": "Alice", + "greeting_style": "formal", + } + }, + } + ], }, - "finish_reason": "stop", + "finish_reason": "tool_calls", } ], "usage": { @@ -199,10 +210,17 @@ async def test_simulate_input_span_on_error(httpx_mock: HTTPXMock, monkeypatch): "index": 0, "message": { "role": "assistant", - "content": "invalid json{{{", # Invalid JSON - "tool_calls": None, + # Malformed: tool call is missing the wrapped "response" key + "content": None, + "tool_calls": [ + { + "id": "call_1", + "name": "submit_tool_response", + "arguments": {}, + } + ], }, - "finish_reason": "stop", + "finish_reason": "tool_calls", } ], "usage": { diff --git a/packages/uipath/tests/cli/eval/mocks/test_mocks.py b/packages/uipath/tests/cli/eval/mocks/test_mocks.py index c4bc26ee3..521871d5f 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_mocks.py +++ b/packages/uipath/tests/cli/eval/mocks/test_mocks.py @@ -569,11 +569,17 @@ def foofoo(*args, **kwargs): { "index": 0, "message": { - "role": "ai", - "content": '"bar1"', - "tool_calls": None, + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_1", + "name": "submit_tool_response", + "arguments": {"response": "bar1"}, + } + ], }, - "finish_reason": "EOS", + "finish_reason": "tool_calls", } ], "usage": { @@ -599,14 +605,13 @@ def foofoo(*args, **kwargs): mock_request = httpx_mock.get_request(method="POST") assert mock_request request = json.loads(mock_request.content.decode("utf-8")) - assert request["response_format"] == { - "type": "json_schema", - "json_schema": { - "name": "OutputSchema", - "strict": False, - "schema": {"type": "string"}, - }, - } + assert "response_format" not in request + assert request["tool_choice"] == {"type": "required"} + tools = request["tools"] + assert len(tools) == 1 + assert tools[0]["name"] == "submit_tool_response" + assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"} + assert tools[0]["parameters"]["required"] == ["response"] with pytest.raises(NotImplementedError): assert foofoo() @@ -678,11 +683,17 @@ async def foofoo(*args, **kwargs): { "index": 0, "message": { - "role": "ai", - "content": '"bar1"', - "tool_calls": None, + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_1", + "name": "submit_tool_response", + "arguments": {"response": "bar1"}, + } + ], }, - "finish_reason": "EOS", + "finish_reason": "tool_calls", } ], "usage": { @@ -708,14 +719,13 @@ async def foofoo(*args, **kwargs): mock_request = httpx_mock.get_request() assert mock_request request = json.loads(mock_request.content.decode("utf-8")) - assert request["response_format"] == { - "type": "json_schema", - "json_schema": { - "name": "OutputSchema", - "strict": False, - "schema": {"type": "string"}, - }, - } + assert "response_format" not in request + assert request["tool_choice"] == {"type": "required"} + tools = request["tools"] + assert len(tools) == 1 + assert tools[0]["name"] == "submit_tool_response" + assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"} + assert tools[0]["parameters"]["required"] == ["response"] with pytest.raises(NotImplementedError): assert await foofoo() @@ -786,11 +796,17 @@ def foo(*args, **kwargs) -> dict[str, Any]: { "index": 0, "message": { - "role": "ai", - "content": '{"content": "bar1"}', - "tool_calls": None, + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_1", + "name": "submit_tool_response", + "arguments": {"response": {"content": "bar1"}}, + } + ], }, - "finish_reason": "EOS", + "finish_reason": "tool_calls", } ], "usage": { @@ -815,19 +831,18 @@ def foo(*args, **kwargs) -> dict[str, Any]: mock_request = httpx_mock.get_request() assert mock_request request = json.loads(mock_request.content.decode("utf-8")) - assert request["response_format"] == { - "type": "json_schema", - "json_schema": { - "name": "OutputSchema", - "strict": False, - "schema": { - "required": ["content"], - "type": "object", - "additionalProperties": False, - "properties": {"content": {"type": "string"}}, - }, - }, + assert "response_format" not in request + assert request["tool_choice"] == {"type": "required"} + tools = request["tools"] + assert len(tools) == 1 + assert tools[0]["name"] == "submit_tool_response" + assert tools[0]["parameters"]["properties"]["response"] == { + "required": ["content"], + "type": "object", + "additionalProperties": False, + "properties": {"content": {"type": "string"}}, } + assert tools[0]["parameters"]["required"] == ["response"] @pytest.mark.asyncio @@ -887,11 +902,17 @@ async def foo(*args, **kwargs) -> dict[str, Any]: { "index": 0, "message": { - "role": "ai", - "content": '{"content": "bar1"}', - "tool_calls": None, + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_1", + "name": "submit_tool_response", + "arguments": {"response": {"content": "bar1"}}, + } + ], }, - "finish_reason": "EOS", + "finish_reason": "tool_calls", } ], "usage": { @@ -916,19 +937,136 @@ async def foo(*args, **kwargs) -> dict[str, Any]: mock_request = httpx_mock.get_request() assert mock_request request = json.loads(mock_request.content.decode("utf-8")) - assert request["response_format"] == { - "type": "json_schema", - "json_schema": { - "name": "OutputSchema", - "strict": False, - "schema": { - "required": ["content"], - "type": "object", - "additionalProperties": False, - "properties": {"content": {"type": "string"}}, - }, + assert "response_format" not in request + assert request["tool_choice"] == {"type": "required"} + tools = request["tools"] + assert len(tools) == 1 + assert tools[0]["name"] == "submit_tool_response" + assert tools[0]["parameters"]["properties"]["response"] == { + "required": ["content"], + "type": "object", + "additionalProperties": False, + "properties": {"content": {"type": "string"}}, + } + assert tools[0]["parameters"]["required"] == ["response"] + + +@pytest.mark.parametrize( + "model", + [ + "gpt-4.1-mini-2025-04-14", + "anthropic.claude-sonnet-4-5-20250929-v1:0", + "gemini-2.5-pro", + ], +) +@pytest.mark.asyncio +@pytest.mark.httpx_mock(assert_all_responses_were_requested=False) +async def test_llm_mockable_structured_output_via_tool_call( + model: str, httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch +): + """Tool simulation must work for all model providers (AE-1646). + + The mocker requests structured output via function calling and reads the + result from the forced tool call's arguments, so it does not depend on the + OpenAI-only ``choices[0].message.content`` shape. Non-OpenAI providers + (Claude/Bedrock, Gemini) return structured output through ``tool_calls`` with + ``content`` set to ``None``; that must not raise. + """ + monkeypatch.setenv("UIPATH_URL", "https://example.com") + monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "1234567890") + monkeypatch.setattr(CacheManager, "get", lambda *args, **kwargs: None) + monkeypatch.setattr(CacheManager, "set", lambda *args, **kwargs: None) + + @mockable() + async def foo(*args, **kwargs) -> str: + raise NotImplementedError() + + evaluation_item: dict[str, Any] = { + "id": "evaluation-id", + "name": "Mock foo", + "inputs": {}, + "evaluationCriterias": { + "ExactMatchEvaluator": None, + }, + "mockingStrategy": { + "type": "llm", + "prompt": "response is 'bar1'", + "toolsToSimulate": [{"name": "foo"}], + "model": {"model": model}, }, } + evaluation = EvaluationItem(**evaluation_item) + assert isinstance(evaluation.mocking_strategy, LLMMockingStrategy) + httpx_mock.add_response( + url="https://example.com/agenthub_/llm/api/capabilities", + status_code=200, + json={}, + ) + httpx_mock.add_response( + url="https://example.com/orchestrator_/llm/api/capabilities", + status_code=200, + json={}, + ) + + httpx_mock.add_response( + url="https://example.com/llm/api/chat/completions" + "?api-version=2024-08-01-preview", + status_code=200, + json={ + "id": "response-id", + "object": "", + "created": 0, + "model": model, + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_1", + "name": "submit_tool_response", + "arguments": {"response": "bar1"}, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "usage": { + "prompt_tokens": 1, + "completion_tokens": 1, + "total_tokens": 2, + }, + }, + ) + + set_execution_context( + MockingContext( + strategy=evaluation.mocking_strategy, + name=evaluation.name, + inputs=evaluation.inputs, + ), + _mock_span_collector, + "test-execution-id", + ) + + assert await foo() == "bar1" + + mock_request = httpx_mock.get_request(method="POST") + assert mock_request + request = json.loads(mock_request.content.decode("utf-8")) + # Structured output is requested via function calling, not response_format, + # so it works across all providers. + assert "response_format" not in request + assert request["tool_choice"] == {"type": "required"} + assert mock_request.headers["X-UiPath-LlmGateway-NormalizedApi-ModelName"] == model + tools = request["tools"] + assert len(tools) == 1 + assert tools[0]["name"] == "submit_tool_response" + assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"} + assert tools[0]["parameters"]["required"] == ["response"] class TestUiPathMockRuntime: diff --git a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py new file mode 100644 index 000000000..5cb0fc1fb --- /dev/null +++ b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py @@ -0,0 +1,73 @@ +"""Unit tests for the provider-agnostic structured-output helpers.""" + +from types import SimpleNamespace + +import pytest + +from uipath.eval.mocks._structured_output import ( + RESPONSE_KEY, + RESPONSE_TOOL_NAME, + build_response_tool, + extract_response, +) + + +def _response(message: SimpleNamespace | None) -> SimpleNamespace: + choices = [] if message is None else [SimpleNamespace(message=message)] + return SimpleNamespace(choices=choices) + + +def test_build_response_tool_wraps_schema_under_response(): + tool = build_response_tool({"type": "string"}, description="desc") + assert tool["name"] == RESPONSE_TOOL_NAME + assert tool["description"] == "desc" + assert tool["parameters"]["properties"][RESPONSE_KEY] == {"type": "string"} + assert tool["parameters"]["required"] == [RESPONSE_KEY] + + +def test_build_response_tool_hoists_defs_to_root(): + # Nested Pydantic models emit root $defs + $ref. Wrapping the schema under + # "response" must hoist $defs to the tool-parameters root so "#/$defs/Item" + # still resolves; otherwise nested-model schemas are invalid. + item_def = {"type": "object", "properties": {"sku": {"type": "string"}}} + schema = { + "type": "object", + "properties": {"items": {"type": "array", "items": {"$ref": "#/$defs/Item"}}}, + "$defs": {"Item": item_def}, + } + + tool = build_response_tool(schema, description="d") + params = tool["parameters"] + + assert params["$defs"] == {"Item": item_def} + assert "$defs" not in params["properties"][RESPONSE_KEY] + # the caller's schema dict is not mutated + assert "$defs" in schema + + +def test_extract_response_returns_wrapped_value(): + message = SimpleNamespace( + content=None, + tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: {"a": 1}})], + ) + assert extract_response(_response(message)) == {"a": 1} + + +def test_extract_response_raises_when_no_choices(): + with pytest.raises(ValueError, match="no choices"): + extract_response(_response(None)) + + +def test_extract_response_raises_when_no_tool_calls(): + # Non-OpenAI text response without a tool call: surface a clear error. + message = SimpleNamespace(content="not a tool call", tool_calls=None) + with pytest.raises(ValueError, match="no tool calls"): + extract_response(_response(message)) + + +def test_extract_response_raises_when_response_key_missing(): + message = SimpleNamespace( + content=None, tool_calls=[SimpleNamespace(arguments={"other": 1})] + ) + with pytest.raises(ValueError, match=RESPONSE_KEY): + extract_response(_response(message)) From ab331990de97137395797704497c53b4f0fca125 Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Fri, 29 May 2026 00:20:34 -0700 Subject: [PATCH 2/5] chore: bump uipath to 2.10.74 and uipath-platform to 0.1.60 The mocker fix in uipath depends on the dict-tool passthrough in uipath-platform, so uipath's lower-bound pin is raised to 0.1.60. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/uipath-platform/pyproject.toml | 2 +- packages/uipath-platform/uv.lock | 2 +- packages/uipath/pyproject.toml | 4 ++-- packages/uipath/uv.lock | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/uipath-platform/pyproject.toml b/packages/uipath-platform/pyproject.toml index 215882460..dfc759f4d 100644 --- a/packages/uipath-platform/pyproject.toml +++ b/packages/uipath-platform/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath-platform" -version = "0.1.59" +version = "0.1.60" description = "HTTP client library for programmatic access to UiPath Platform" readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/packages/uipath-platform/uv.lock b/packages/uipath-platform/uv.lock index dabbd63ad..084f3efb8 100644 --- a/packages/uipath-platform/uv.lock +++ b/packages/uipath-platform/uv.lock @@ -1095,7 +1095,7 @@ dev = [ [[package]] name = "uipath-platform" -version = "0.1.59" +version = "0.1.60" source = { editable = "." } dependencies = [ { name = "httpx" }, diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml index a7a978f7c..21c22f8a8 100644 --- a/packages/uipath/pyproject.toml +++ b/packages/uipath/pyproject.toml @@ -1,13 +1,13 @@ [project] name = "uipath" -version = "2.10.73" +version = "2.10.74" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" dependencies = [ "uipath-core>=0.5.8, <0.6.0", "uipath-runtime>=0.10.1, <0.11.0", - "uipath-platform>=0.1.59, <0.2.0", + "uipath-platform>=0.1.60, <0.2.0", "click>=8.3.1", "httpx>=0.28.1", "pyjwt>=2.10.1", diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock index 3cf75bd40..365ed4d35 100644 --- a/packages/uipath/uv.lock +++ b/packages/uipath/uv.lock @@ -2552,7 +2552,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.10.73" +version = "2.10.74" source = { editable = "." } dependencies = [ { name = "applicationinsights" }, @@ -2691,7 +2691,7 @@ dev = [ [[package]] name = "uipath-platform" -version = "0.1.59" +version = "0.1.60" source = { editable = "../uipath-platform" } dependencies = [ { name = "httpx" }, From 9a42ed38b5e70f93cce37f3080a53f779bf04b80 Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Fri, 29 May 2026 00:29:12 -0700 Subject: [PATCH 3/5] fix(eval): inline $defs/$ref in tool schema for non-OpenAI tool simulation The normalized gateway accepts $ref/$defs in response_format but not inside a tool's parameters. Tool outputs typed as nested Pydantic models/enums (e.g. calculator's get_random_operator -> Wrapper[Operator]) produced a tool schema with $ref/$defs that the gateway rejected, so simulation failed. Inline the definitions into a self-contained schema (cyclic refs keep their $defs). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../uipath/eval/mocks/_structured_output.py | 60 +++++++++++++++---- .../cli/eval/mocks/test_structured_output.py | 49 ++++++++++++--- 2 files changed, 91 insertions(+), 18 deletions(-) diff --git a/packages/uipath/src/uipath/eval/mocks/_structured_output.py b/packages/uipath/src/uipath/eval/mocks/_structured_output.py index 424935190..604553e1a 100644 --- a/packages/uipath/src/uipath/eval/mocks/_structured_output.py +++ b/packages/uipath/src/uipath/eval/mocks/_structured_output.py @@ -12,6 +12,50 @@ RESPONSE_TOOL_NAME = "submit_tool_response" RESPONSE_KEY = "response" +_DEFS_PREFIX = "#/$defs/" + + +def _inline_defs( + schema: dict[str, Any], +) -> tuple[dict[str, Any], dict[str, Any]]: + """Inline ``$defs``/``$ref`` into a self-contained schema. + + Nested Pydantic models and enums emit root ``$defs`` referenced by ``$ref``. + The normalized gateway accepts those in ``response_format`` but not inside a + tool's ``parameters``, so they are inlined here. Self-referential definitions + cannot be inlined without looping; any ``$ref`` reached while its target is + already on the current resolution path is left untouched and its definitions + are returned so the caller can keep them reachable. + + Returns: + A tuple of (inlined schema, leftover ``$defs`` needed for cyclic refs). + """ + defs = schema.get("$defs", {}) + leftover: dict[str, Any] = {} + + def resolve(node: Any, active: frozenset[str]) -> Any: + if isinstance(node, dict): + ref = node.get("$ref") + if isinstance(ref, str) and ref.startswith(_DEFS_PREFIX): + name = ref[len(_DEFS_PREFIX) :] + if name in defs and name not in active: + return resolve(defs[name], active | {name}) + # Cyclic or unknown ref: keep it and preserve its definition. + if name in defs: + leftover[name] = defs[name] + return dict(node) + return { + key: resolve(value, active) + for key, value in node.items() + if key != "$defs" + } + if isinstance(node, list): + return [resolve(item, active) for item in node] + return node + + root = {key: value for key, value in schema.items() if key != "$defs"} + inlined = resolve(root, frozenset()) + return inlined, leftover def build_response_tool(schema: dict[str, Any], description: str) -> dict[str, Any]: @@ -19,22 +63,18 @@ def build_response_tool(schema: dict[str, Any], description: str) -> dict[str, A Tool-call arguments are always a JSON object, so an arbitrary output schema (which may be a scalar, array, or object) is nested under a single - ``response`` property and unwrapped after the call. - - Schemas from nested Pydantic models carry root ``$defs`` referenced by - ``$ref`` values like ``#/$defs/Item``. Those ``$ref`` paths resolve from the - parameters root, so ``$defs`` is hoisted there instead of being buried under - ``response`` (which would leave the references dangling). + ``response`` property and unwrapped after the call. ``$defs``/``$ref`` are + inlined so the tool parameters are self-contained, which the gateway requires + for tool schemas (unlike ``response_format``). """ - response_schema = dict(schema) + response_schema, leftover_defs = _inline_defs(schema) parameters: dict[str, Any] = { "type": "object", "properties": {RESPONSE_KEY: response_schema}, "required": [RESPONSE_KEY], } - defs = response_schema.pop("$defs", None) - if defs is not None: - parameters["$defs"] = defs + if leftover_defs: + parameters["$defs"] = leftover_defs return { "name": RESPONSE_TOOL_NAME, diff --git a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py index 5cb0fc1fb..c59a01a5b 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py +++ b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py @@ -1,5 +1,6 @@ """Unit tests for the provider-agnostic structured-output helpers.""" +import json from types import SimpleNamespace import pytest @@ -25,22 +26,54 @@ def test_build_response_tool_wraps_schema_under_response(): assert tool["parameters"]["required"] == [RESPONSE_KEY] -def test_build_response_tool_hoists_defs_to_root(): - # Nested Pydantic models emit root $defs + $ref. Wrapping the schema under - # "response" must hoist $defs to the tool-parameters root so "#/$defs/Item" - # still resolves; otherwise nested-model schemas are invalid. +def test_build_response_tool_inlines_refs_into_self_contained_schema(): + # Nested Pydantic models / enums emit $defs + $ref. The normalized gateway + # accepts $ref/$defs in response_format but NOT in a tool's parameters, so the + # schema must be inlined into a self-contained form (no $ref/$defs anywhere). + operator_def = {"enum": ["+", "-", "*", "/"], "type": "string"} item_def = {"type": "object", "properties": {"sku": {"type": "string"}}} schema = { "type": "object", - "properties": {"items": {"type": "array", "items": {"$ref": "#/$defs/Item"}}}, - "$defs": {"Item": item_def}, + "properties": { + "operator": {"$ref": "#/$defs/Operator"}, + "items": {"type": "array", "items": {"$ref": "#/$defs/Item"}}, + }, + "required": ["operator"], + "$defs": {"Operator": operator_def, "Item": item_def}, } tool = build_response_tool(schema, description="d") params = tool["parameters"] - assert params["$defs"] == {"Item": item_def} - assert "$defs" not in params["properties"][RESPONSE_KEY] + blob = json.dumps(params) + assert "$ref" not in blob + assert "$defs" not in blob + + response = params["properties"][RESPONSE_KEY] + assert response["properties"]["operator"] == operator_def + assert response["properties"]["items"]["items"] == item_def + # caller's schema is not mutated + assert "$defs" in schema + + +def test_build_response_tool_keeps_defs_for_cyclic_refs(): + # Self-referential schemas can't be fully inlined; keep $defs hoisted so the + # remaining $ref still resolves rather than infinite-looping. + node_def = { + "type": "object", + "properties": {"child": {"$ref": "#/$defs/Node"}}, + } + schema = { + "type": "object", + "properties": {"root": {"$ref": "#/$defs/Node"}}, + "$defs": {"Node": node_def}, + } + + tool = build_response_tool(schema, description="d") + params = tool["parameters"] + + assert "$defs" in params + assert "$ref" in json.dumps(params) # the caller's schema dict is not mutated assert "$defs" in schema From ae78cbe6ae5f6ba83c8f02dbaceec1fcd8667ac7 Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Fri, 29 May 2026 01:06:03 -0700 Subject: [PATCH 4/5] fix(eval): prefer response_format with tool-call fallback for mockers All-tool-calling regressed OpenAI tool simulation (calculator-evals 'Test Random Addition Using LLM' became flaky: gpt_4_1_mini returned wrong/empty values for a nested-enum output schema via function calling, where response_format was reliable). Make structured-output generation adaptive: prefer response_format (honored reliably by OpenAI, native $defs support) and fall back to a forced tool call only when content comes back empty (the non-OpenAI failure mode, e.g. Claude/Bedrock). Shared in generate_structured_output(), used by both mockers. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/uipath/eval/mocks/_input_mocker.py | 25 +- .../src/uipath/eval/mocks/_llm_mocker.py | 37 +-- .../uipath/eval/mocks/_structured_output.py | 70 ++++- .../tests/cli/eval/mocks/test_input_mocker.py | 25 +- .../cli/eval/mocks/test_input_mocker_span.py | 30 +- .../uipath/tests/cli/eval/mocks/test_mocks.py | 268 ++++++++---------- .../cli/eval/mocks/test_structured_output.py | 88 ++++++ 7 files changed, 307 insertions(+), 236 deletions(-) diff --git a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py index 3c4daac51..a542fc7ad 100644 --- a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py +++ b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py @@ -11,12 +11,11 @@ from uipath.platform import UiPath from uipath.platform.chat import UiPathLlmChatService from uipath.platform.chat._llm_gateway_service import ChatModels -from uipath.platform.chat.llm_gateway import RequiredToolChoice from .._execution_context import eval_set_run_id_context from ._mock_context import cache_manager_context from ._mocker import UiPathInputMockingError -from ._structured_output import build_response_tool, extract_response +from ._structured_output import generate_structured_output from ._types import ( InputMockingStrategy, ) @@ -107,14 +106,6 @@ async def generate_llm_input( prompt = get_input_mocking_prompt(**prompt_generation_args) - # Request structured output via function calling so it works across all - # model providers (OpenAI, Claude/Bedrock, Gemini); response_format is only - # honored for OpenAI models on the normalized gateway. - response_tool = build_response_tool( - input_schema, - description="Return the simulated agent input matching the required schema.", - ) - model_parameters = mocking_strategy.model if mocking_strategy else None completion_kwargs = ( model_parameters.model_dump(by_alias=False, exclude_none=True) @@ -129,7 +120,7 @@ async def generate_llm_input( if cache_manager is not None: cache_key_data = { - "response_tool": response_tool, + "input_schema": input_schema, "completion_kwargs": completion_kwargs, "prompt_generation_args": prompt_generation_args, } @@ -143,15 +134,15 @@ async def generate_llm_input( if cached_response is not None: return cached_response - response = await llm.chat_completions( + result = await generate_structured_output( + llm, [{"role": "user", "content": prompt}], - tools=[response_tool], - tool_choice=RequiredToolChoice(), - **completion_kwargs, + schema=input_schema, + response_format_name="agent_input", + description="Return the simulated agent input matching the required schema.", + completion_kwargs=completion_kwargs, ) - result = extract_response(response) - if cache_manager is not None: cache_manager.set( mocker_type="input_mocker", diff --git a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py index ce932da11..a9ab7005e 100644 --- a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py +++ b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py @@ -11,7 +11,6 @@ from uipath.platform import UiPath from uipath.platform.chat import UiPathLlmChatService from uipath.platform.chat._llm_gateway_service import ChatModels, _cleanup_schema -from uipath.platform.chat.llm_gateway import RequiredToolChoice from .._execution_context import ( eval_set_run_id_context, @@ -29,7 +28,7 @@ UiPathMockResponseGenerationError, UiPathNoMockFoundError, ) -from ._structured_output import build_response_tool, extract_response +from ._structured_output import generate_structured_output from ._types import ( ExampleCall, LLMMockingStrategy, @@ -127,16 +126,7 @@ async def response( "output_schema", TypeAdapter(return_type).json_schema() ) - # Request structured output via function calling so it works across - # all model providers (OpenAI, Claude/Bedrock, Gemini); response_format - # is only honored for OpenAI models on the normalized gateway. - response_tool = build_response_tool( - _cleanup_schema(output_schema), - description=( - "Return the simulated response for tool " - f"'{function_name}' matching the required schema." - ), - ) + cleaned_schema = _cleanup_schema(output_schema) try: # Safely pull examples from params. example_calls = params.get("example_calls", []) @@ -201,7 +191,7 @@ async def response( formatted_prompt = PROMPT.format(**prompt_generation_args) cache_key_data = { - "response_tool": response_tool, + "output_schema": cleaned_schema, "completion_kwargs": completion_kwargs, "prompt_generation_args": prompt_generation_args, } @@ -217,18 +207,17 @@ async def response( if cached_response is not None: return cached_response - response = await llm.chat_completions( - [ - { - "role": "user", - "content": formatted_prompt, - }, - ], - tools=[response_tool], - tool_choice=RequiredToolChoice(), - **completion_kwargs, + result = await generate_structured_output( + llm, + [{"role": "user", "content": formatted_prompt}], + schema=cleaned_schema, + response_format_name="OutputSchema", + description=( + "Return the simulated response for tool " + f"'{function_name}' matching the required schema." + ), + completion_kwargs=completion_kwargs, ) - result = extract_response(response) if cache_manager is not None: cache_manager.set( diff --git a/packages/uipath/src/uipath/eval/mocks/_structured_output.py b/packages/uipath/src/uipath/eval/mocks/_structured_output.py index 604553e1a..1f67565b9 100644 --- a/packages/uipath/src/uipath/eval/mocks/_structured_output.py +++ b/packages/uipath/src/uipath/eval/mocks/_structured_output.py @@ -1,19 +1,27 @@ -"""Provider-agnostic structured output via LLM function calling. +"""Provider-agnostic structured output for the eval mockers. The normalized LLM Gateway honors OpenAI-style ``response_format`` (json_schema) -only for OpenAI models. Non-OpenAI providers (Anthropic/Claude via Bedrock, -Gemini) return such requests with ``choices[0].message.content`` empty/None, -which breaks JSON parsing. Function calling is honored across all providers, so -the mockers request structured output as a forced tool call and read the result -from the tool call's parsed arguments. +only for OpenAI models — and does so reliably, including native ``$defs`` +support. Non-OpenAI providers (Anthropic/Claude via Bedrock, Gemini) return such +requests with ``choices[0].message.content`` empty/None, which breaks JSON +parsing. Function calling is honored across providers but is less reliable for +OpenAI on some schemas, so it is used only as a fallback: prefer +``response_format`` and fall back to a forced tool call when the content comes +back empty. """ +import json +import logging from typing import Any +from uipath.platform.chat.llm_gateway import RequiredToolChoice + RESPONSE_TOOL_NAME = "submit_tool_response" RESPONSE_KEY = "response" _DEFS_PREFIX = "#/$defs/" +logger = logging.getLogger(__name__) + def _inline_defs( schema: dict[str, Any], @@ -108,3 +116,53 @@ def extract_response(response: Any) -> Any: ) return arguments[RESPONSE_KEY] + + +async def generate_structured_output( + llm: Any, + messages: list[dict[str, str]], + *, + schema: dict[str, Any], + response_format_name: str, + description: str, + completion_kwargs: dict[str, Any], +) -> Any: + """Generate structured output that works across all model providers. + + Prefers ``response_format`` (json_schema) — honored reliably by OpenAI with + native ``$defs`` support. When the provider returns empty content (the + non-OpenAI failure mode, e.g. Claude/Bedrock), falls back to a forced tool + call, which is honored across providers. + """ + response_format = { + "type": "json_schema", + "json_schema": { + "name": response_format_name, + "strict": False, + "schema": schema, + }, + } + + content: str | None = None + try: + rf_response = await llm.chat_completions( + messages, response_format=response_format, **completion_kwargs + ) + choices = getattr(rf_response, "choices", None) + if choices: + content = choices[0].message.content + except Exception as e: + # Some providers reject response_format outright; fall back to tools. + logger.info("response_format path failed, falling back to tools: %s", e) + + if content: + return json.loads(content) + + tool = build_response_tool(schema, description) + tc_response = await llm.chat_completions( + messages, + tools=[tool], + tool_choice=RequiredToolChoice(), + **completion_kwargs, + ) + return extract_response(tc_response) diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py index 181a14b6a..a8a8a64ec 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py +++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py @@ -83,18 +83,10 @@ async def test_generate_llm_input_with_model_settings( "index": 0, "message": { "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": "call_1", - "name": "submit_tool_response", - "arguments": { - "response": {"query": "Calculate 5 times 7"} - }, - } - ], + "content": '{"query": "Calculate 5 times 7"}', + "tool_calls": None, }, - "finish_reason": "tool_calls", + "finish_reason": "stop", } ], "usage": { @@ -121,14 +113,9 @@ async def test_generate_llm_input_with_model_settings( "Expected exactly one chat completion request" ) - # Structured output is requested via function calling (provider-agnostic), - # not via response_format which the gateway only honors for OpenAI models. + # OpenAI returns content via response_format; no tool-call fallback needed. import json body = json.loads(chat_completion_requests[0].content.decode("utf-8")) - assert "response_format" not in body - assert body["tool_choice"] == {"type": "required"} - tools = body["tools"] - assert len(tools) == 1 - assert tools[0]["name"] == "submit_tool_response" - assert tools[0]["parameters"]["properties"]["response"] == input_schema + assert "response_format" in body + assert "tools" not in body diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py index aeffa2d1e..19a432fef 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py +++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py @@ -57,21 +57,10 @@ async def test_simulate_input_span_attributes(httpx_mock: HTTPXMock, monkeypatch "index": 0, "message": { "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": "call_1", - "name": "submit_tool_response", - "arguments": { - "response": { - "name": "Alice", - "greeting_style": "formal", - } - }, - } - ], + "content": '{"name": "Alice", "greeting_style": "formal"}', + "tool_calls": None, }, - "finish_reason": "tool_calls", + "finish_reason": "stop", } ], "usage": { @@ -210,17 +199,10 @@ async def test_simulate_input_span_on_error(httpx_mock: HTTPXMock, monkeypatch): "index": 0, "message": { "role": "assistant", - # Malformed: tool call is missing the wrapped "response" key - "content": None, - "tool_calls": [ - { - "id": "call_1", - "name": "submit_tool_response", - "arguments": {}, - } - ], + "content": "invalid json{{{", # Invalid JSON + "tool_calls": None, }, - "finish_reason": "tool_calls", + "finish_reason": "stop", } ], "usage": { diff --git a/packages/uipath/tests/cli/eval/mocks/test_mocks.py b/packages/uipath/tests/cli/eval/mocks/test_mocks.py index 521871d5f..ab85deb96 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_mocks.py +++ b/packages/uipath/tests/cli/eval/mocks/test_mocks.py @@ -569,17 +569,11 @@ def foofoo(*args, **kwargs): { "index": 0, "message": { - "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": "call_1", - "name": "submit_tool_response", - "arguments": {"response": "bar1"}, - } - ], + "role": "ai", + "content": '"bar1"', + "tool_calls": None, }, - "finish_reason": "tool_calls", + "finish_reason": "EOS", } ], "usage": { @@ -605,22 +599,25 @@ def foofoo(*args, **kwargs): mock_request = httpx_mock.get_request(method="POST") assert mock_request request = json.loads(mock_request.content.decode("utf-8")) - assert "response_format" not in request - assert request["tool_choice"] == {"type": "required"} - tools = request["tools"] - assert len(tools) == 1 - assert tools[0]["name"] == "submit_tool_response" - assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"} - assert tools[0]["parameters"]["required"] == ["response"] + assert request["response_format"] == { + "type": "json_schema", + "json_schema": { + "name": "OutputSchema", + "strict": False, + "schema": {"type": "string"}, + }, + } with pytest.raises(NotImplementedError): assert foofoo() - httpx_mock.add_response( - url="https://example.com/llm/api/chat/completions" - "?api-version=2024-08-01-preview", - status_code=200, - json={}, - ) + # Two empty responses: the response_format attempt and the tool-call fallback. + for _ in range(2): + httpx_mock.add_response( + url="https://example.com/llm/api/chat/completions" + "?api-version=2024-08-01-preview", + status_code=200, + json={}, + ) with pytest.raises(UiPathMockResponseGenerationError): assert foo() @@ -683,17 +680,11 @@ async def foofoo(*args, **kwargs): { "index": 0, "message": { - "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": "call_1", - "name": "submit_tool_response", - "arguments": {"response": "bar1"}, - } - ], + "role": "ai", + "content": '"bar1"', + "tool_calls": None, }, - "finish_reason": "tool_calls", + "finish_reason": "EOS", } ], "usage": { @@ -719,23 +710,26 @@ async def foofoo(*args, **kwargs): mock_request = httpx_mock.get_request() assert mock_request request = json.loads(mock_request.content.decode("utf-8")) - assert "response_format" not in request - assert request["tool_choice"] == {"type": "required"} - tools = request["tools"] - assert len(tools) == 1 - assert tools[0]["name"] == "submit_tool_response" - assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"} - assert tools[0]["parameters"]["required"] == ["response"] + assert request["response_format"] == { + "type": "json_schema", + "json_schema": { + "name": "OutputSchema", + "strict": False, + "schema": {"type": "string"}, + }, + } with pytest.raises(NotImplementedError): assert await foofoo() - httpx_mock.add_response( - url="https://example.com/llm/api/chat/completions" - "?api-version=2024-08-01-preview", - status_code=200, - json={}, - ) + # Two empty responses: the response_format attempt and the tool-call fallback. + for _ in range(2): + httpx_mock.add_response( + url="https://example.com/llm/api/chat/completions" + "?api-version=2024-08-01-preview", + status_code=200, + json={}, + ) with pytest.raises(UiPathMockResponseGenerationError): assert await foo() @@ -796,17 +790,11 @@ def foo(*args, **kwargs) -> dict[str, Any]: { "index": 0, "message": { - "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": "call_1", - "name": "submit_tool_response", - "arguments": {"response": {"content": "bar1"}}, - } - ], + "role": "ai", + "content": '{"content": "bar1"}', + "tool_calls": None, }, - "finish_reason": "tool_calls", + "finish_reason": "EOS", } ], "usage": { @@ -831,18 +819,19 @@ def foo(*args, **kwargs) -> dict[str, Any]: mock_request = httpx_mock.get_request() assert mock_request request = json.loads(mock_request.content.decode("utf-8")) - assert "response_format" not in request - assert request["tool_choice"] == {"type": "required"} - tools = request["tools"] - assert len(tools) == 1 - assert tools[0]["name"] == "submit_tool_response" - assert tools[0]["parameters"]["properties"]["response"] == { - "required": ["content"], - "type": "object", - "additionalProperties": False, - "properties": {"content": {"type": "string"}}, + assert request["response_format"] == { + "type": "json_schema", + "json_schema": { + "name": "OutputSchema", + "strict": False, + "schema": { + "required": ["content"], + "type": "object", + "additionalProperties": False, + "properties": {"content": {"type": "string"}}, + }, + }, } - assert tools[0]["parameters"]["required"] == ["response"] @pytest.mark.asyncio @@ -902,17 +891,11 @@ async def foo(*args, **kwargs) -> dict[str, Any]: { "index": 0, "message": { - "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": "call_1", - "name": "submit_tool_response", - "arguments": {"response": {"content": "bar1"}}, - } - ], + "role": "ai", + "content": '{"content": "bar1"}', + "tool_calls": None, }, - "finish_reason": "tool_calls", + "finish_reason": "EOS", } ], "usage": { @@ -937,40 +920,31 @@ async def foo(*args, **kwargs) -> dict[str, Any]: mock_request = httpx_mock.get_request() assert mock_request request = json.loads(mock_request.content.decode("utf-8")) - assert "response_format" not in request - assert request["tool_choice"] == {"type": "required"} - tools = request["tools"] - assert len(tools) == 1 - assert tools[0]["name"] == "submit_tool_response" - assert tools[0]["parameters"]["properties"]["response"] == { - "required": ["content"], - "type": "object", - "additionalProperties": False, - "properties": {"content": {"type": "string"}}, + assert request["response_format"] == { + "type": "json_schema", + "json_schema": { + "name": "OutputSchema", + "strict": False, + "schema": { + "required": ["content"], + "type": "object", + "additionalProperties": False, + "properties": {"content": {"type": "string"}}, + }, + }, } - assert tools[0]["parameters"]["required"] == ["response"] -@pytest.mark.parametrize( - "model", - [ - "gpt-4.1-mini-2025-04-14", - "anthropic.claude-sonnet-4-5-20250929-v1:0", - "gemini-2.5-pro", - ], -) @pytest.mark.asyncio @pytest.mark.httpx_mock(assert_all_responses_were_requested=False) -async def test_llm_mockable_structured_output_via_tool_call( - model: str, httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch +async def test_llm_mockable_falls_back_to_tool_call_for_non_openai( + httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch ): - """Tool simulation must work for all model providers (AE-1646). + """Tool simulation works for non-OpenAI providers (AE-1646). - The mocker requests structured output via function calling and reads the - result from the forced tool call's arguments, so it does not depend on the - OpenAI-only ``choices[0].message.content`` shape. Non-OpenAI providers - (Claude/Bedrock, Gemini) return structured output through ``tool_calls`` with - ``content`` set to ``None``; that must not raise. + Non-OpenAI providers (Claude/Bedrock, Gemini) return ``response_format`` + requests with empty ``content``. The mocker must then fall back to function + calling and read the result from the forced tool call's arguments. """ monkeypatch.setenv("UIPATH_URL", "https://example.com") monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "1234567890") @@ -992,7 +966,7 @@ async def foo(*args, **kwargs) -> str: "type": "llm", "prompt": "response is 'bar1'", "toolsToSimulate": [{"name": "foo"}], - "model": {"model": model}, + "model": {"model": "anthropic.claude-sonnet-4-5-20250929-v1:0"}, }, } evaluation = EvaluationItem(**evaluation_item) @@ -1008,38 +982,41 @@ async def foo(*args, **kwargs) -> str: json={}, ) + def _completion(message: dict[str, Any]) -> dict[str, Any]: + return { + "id": "response-id", + "object": "", + "created": 0, + "model": "anthropic.claude-sonnet-4-5-20250929-v1:0", + "choices": [{"index": 0, "message": message, "finish_reason": "stop"}], + "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2}, + } + + # First call (response_format) returns empty content — the non-OpenAI failure. httpx_mock.add_response( url="https://example.com/llm/api/chat/completions" "?api-version=2024-08-01-preview", status_code=200, - json={ - "id": "response-id", - "object": "", - "created": 0, - "model": model, - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": "call_1", - "name": "submit_tool_response", - "arguments": {"response": "bar1"}, - } - ], - }, - "finish_reason": "tool_calls", - } - ], - "usage": { - "prompt_tokens": 1, - "completion_tokens": 1, - "total_tokens": 2, - }, - }, + json=_completion({"role": "assistant", "content": None, "tool_calls": None}), + ) + # Fallback call (function calling) returns the structured result. + httpx_mock.add_response( + url="https://example.com/llm/api/chat/completions" + "?api-version=2024-08-01-preview", + status_code=200, + json=_completion( + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_1", + "name": "submit_tool_response", + "arguments": {"response": "bar1"}, + } + ], + } + ), ) set_execution_context( @@ -1054,19 +1031,18 @@ async def foo(*args, **kwargs) -> str: assert await foo() == "bar1" - mock_request = httpx_mock.get_request(method="POST") - assert mock_request - request = json.loads(mock_request.content.decode("utf-8")) - # Structured output is requested via function calling, not response_format, - # so it works across all providers. - assert "response_format" not in request - assert request["tool_choice"] == {"type": "required"} - assert mock_request.headers["X-UiPath-LlmGateway-NormalizedApi-ModelName"] == model - tools = request["tools"] - assert len(tools) == 1 - assert tools[0]["name"] == "submit_tool_response" - assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"} - assert tools[0]["parameters"]["required"] == ["response"] + requests = [ + r for r in httpx_mock.get_requests() if "chat/completions" in str(r.url) + ] + assert len(requests) == 2 + first = json.loads(requests[0].content.decode("utf-8")) + second = json.loads(requests[1].content.decode("utf-8")) + # First attempt uses response_format; fallback uses a forced tool call. + assert "response_format" in first + assert "tools" not in first + assert second["tool_choice"] == {"type": "required"} + assert second["tools"][0]["name"] == "submit_tool_response" + assert "response_format" not in second class TestUiPathMockRuntime: diff --git a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py index c59a01a5b..730db0449 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py +++ b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py @@ -10,6 +10,7 @@ RESPONSE_TOOL_NAME, build_response_tool, extract_response, + generate_structured_output, ) @@ -18,6 +19,21 @@ def _response(message: SimpleNamespace | None) -> SimpleNamespace: return SimpleNamespace(choices=choices) +class _FakeLLM: + """Records chat_completions calls and replays queued responses in order.""" + + def __init__(self, responses): + self._responses = list(responses) + self.calls: list[dict] = [] + + async def chat_completions(self, messages, **kwargs): + self.calls.append(kwargs) + nxt = self._responses.pop(0) + if isinstance(nxt, Exception): + raise nxt + return nxt + + def test_build_response_tool_wraps_schema_under_response(): tool = build_response_tool({"type": "string"}, description="desc") assert tool["name"] == RESPONSE_TOOL_NAME @@ -104,3 +120,75 @@ def test_extract_response_raises_when_response_key_missing(): ) with pytest.raises(ValueError, match=RESPONSE_KEY): extract_response(_response(message)) + + +@pytest.mark.asyncio +async def test_generate_structured_output_prefers_response_format_content(): + # OpenAI returns content via response_format; no fallback call is made. + llm = _FakeLLM([_response(SimpleNamespace(content='{"a": 1}', tool_calls=None))]) + result = await generate_structured_output( + llm, + [{"role": "user", "content": "x"}], + schema={"type": "object"}, + response_format_name="OutputSchema", + description="d", + completion_kwargs={}, + ) + assert result == {"a": 1} + assert len(llm.calls) == 1 + assert "response_format" in llm.calls[0] + assert "tools" not in llm.calls[0] + + +@pytest.mark.asyncio +async def test_generate_structured_output_falls_back_on_empty_content(): + # Non-OpenAI: response_format yields empty content -> fall back to tool call. + llm = _FakeLLM( + [ + _response(SimpleNamespace(content=None, tool_calls=None)), + _response( + SimpleNamespace( + content=None, + tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: {"a": 1}})], + ) + ), + ] + ) + result = await generate_structured_output( + llm, + [{"role": "user", "content": "x"}], + schema={"type": "object"}, + response_format_name="OutputSchema", + description="d", + completion_kwargs={}, + ) + assert result == {"a": 1} + assert len(llm.calls) == 2 + assert "response_format" in llm.calls[0] + assert "tools" in llm.calls[1] and "tool_choice" in llm.calls[1] + + +@pytest.mark.asyncio +async def test_generate_structured_output_falls_back_when_response_format_raises(): + # A provider that rejects response_format outright still gets a tool fallback. + llm = _FakeLLM( + [ + RuntimeError("response_format unsupported"), + _response( + SimpleNamespace( + content=None, + tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: "ok"})], + ) + ), + ] + ) + result = await generate_structured_output( + llm, + [{"role": "user", "content": "x"}], + schema={"type": "string"}, + response_format_name="OutputSchema", + description="d", + completion_kwargs={}, + ) + assert result == "ok" + assert len(llm.calls) == 2 From b4954bee90becf479b6f0815da0ec8eb8a3ceaca Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Fri, 29 May 2026 01:19:31 -0700 Subject: [PATCH 5/5] test(eval): add explicit type params to _FakeLLM for mypy Co-Authored-By: Claude Opus 4.8 (1M context) --- .../uipath/tests/cli/eval/mocks/test_structured_output.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py index 730db0449..a19e2605e 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py +++ b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py @@ -2,6 +2,7 @@ import json from types import SimpleNamespace +from typing import Any import pytest @@ -22,11 +23,11 @@ def _response(message: SimpleNamespace | None) -> SimpleNamespace: class _FakeLLM: """Records chat_completions calls and replays queued responses in order.""" - def __init__(self, responses): + def __init__(self, responses: list[Any]): self._responses = list(responses) - self.calls: list[dict] = [] + self.calls: list[dict[str, Any]] = [] - async def chat_completions(self, messages, **kwargs): + async def chat_completions(self, messages: Any, **kwargs: Any) -> Any: self.calls.append(kwargs) nxt = self._responses.pop(0) if isinstance(nxt, Exception):