UiPath · Chibionos · May 29, 2026
diff --git a/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py b/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py
@@ -401,7 +401,7 @@ async def chat_completions(
         presence_penalty: float = 0,
         top_p: float | None = 1,
         top_k: int | None = None,
-        tools: list[ToolDefinition] | None = None,
+        tools: list[ToolDefinition | dict[str, Any]] | None = None,
         tool_choice: ToolChoice | None = None,
         response_format: dict[str, Any] | type[BaseModel] | None = None,
         api_version: str = NORMALIZED_API_VERSION,
@@ -583,10 +583,15 @@ class Country(BaseModel):
                 # Use provided dictionary format directly
                 request_body["response_format"] = response_format
 
-        # Add tools if provided - convert to UiPath format
+        # Add tools if provided. A tool already in UiPath wire format (a dict) is
+        # passed through unchanged so callers can supply an arbitrary JSON schema
+        # for the parameters; ToolDefinition objects are converted as before.
         if tools:
             request_body["tools"] = [
-                self._convert_tool_to_uipath_format(tool) for tool in tools
+                tool
+                if isinstance(tool, dict)
+                else self._convert_tool_to_uipath_format(tool)
+                for tool in tools
             ]
 
         # Handle tool_choice

diff --git a/packages/uipath-platform/tests/services/test_uipath_llm_integration.py b/packages/uipath-platform/tests/services/test_uipath_llm_integration.py
@@ -7,6 +7,7 @@
 from uipath.platform.chat import (
     AutoToolChoice,
     ChatModels,
+    RequiredToolChoice,
     SpecificToolChoice,
     ToolDefinition,
     ToolFunctionDefinition,
@@ -369,6 +370,87 @@ async def test_tool_call_required_mocked(self, mock_request, llm_service):
         assert result.choices[0].message.tool_calls[0].arguments["name"] == "John"
         assert result.choices[0].message.tool_calls[0].arguments["password"] == "1234"
 
+    @pytest.mark.asyncio
+    @patch.object(UiPathLlmChatService, "request_async")
+    async def test_raw_dict_tool_passthrough_mocked(self, mock_request, llm_service):
+        """Check that a dict-form tool is placed in the request body unmodified.
+
+        The tool is handed to ``chat_completions`` as a plain dict whose
+        ``parameters`` hold a nested object/array schema. The mocked
+        ``request_async`` call must carry an equal tool in its JSON body together
+        with the required tool choice, and the tool-call arguments from the canned
+        reply must be readable on the result.
+        """
+        # Nested schema assembled inside-out: an array of objects under "items".
+        sku_item_schema = {
+            "type": "object",
+            "properties": {"sku": {"type": "string"}},
+        }
+        wrapped_schema = {
+            "type": "object",
+            "properties": {
+                "items": {
+                    "type": "array",
+                    "items": sku_item_schema,
+                }
+            },
+        }
+        nested_tool = {
+            "name": "submit_tool_response",
+            "description": "Return the simulated response matching the schema.",
+            "parameters": {
+                "type": "object",
+                "properties": {"response": wrapped_schema},
+                "required": ["response"],
+            },
+        }
+
+        # Canned gateway reply containing a single tool call with nested arguments.
+        assistant_message = {
+            "role": "assistant",
+            "content": None,
+            "tool_calls": [
+                {
+                    "id": "call_raw",
+                    "name": "submit_tool_response",
+                    "arguments": {"response": {"items": [{"sku": "A1"}]}},
+                }
+            ],
+        }
+        canned_reply = MagicMock()
+        canned_reply.json.return_value = {
+            "id": "chatcmpl-raw",
+            "object": "chat.completion",
+            "created": 1677858242,
+            "model": "gpt-4o-mini-2024-07-18",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": assistant_message,
+                    "finish_reason": "tool_calls",
+                }
+            ],
+            "usage": {
+                "prompt_tokens": 10,
+                "completion_tokens": 5,
+                "total_tokens": 15,
+                "cache_read_input_tokens": None,
+            },
+        }
+        mock_request.return_value = canned_reply
+
+        result = await llm_service.chat_completions(
+            messages=[{"role": "user", "content": "go"}],
+            model=ChatModels.gpt_4_1_mini_2025_04_14,
+            tools=[nested_tool],
+            tool_choice=RequiredToolChoice(),
+        )
+
+        mock_request.assert_called_once()
+        sent_body = mock_request.call_args.kwargs["json"]
+        # The outgoing tool list must equal the dict supplied by the caller,
+        # nested array schema included.
+        assert sent_body["tools"] == [nested_tool]
+        assert sent_body["tool_choice"] == {"type": "required"}
+
+        first_tool_call = result.choices[0].message.tool_calls[0]
+        assert first_tool_call.arguments == {"response": {"items": [{"sku": "A1"}]}}
+
+
     @pytest.mark.asyncio
     @patch.object(UiPathLlmChatService, "request_async")
     async def test_chat_with_conversation_history_mocked(

diff --git a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
@@ -11,10 +11,12 @@
 from uipath.platform import UiPath
 from uipath.platform.chat import UiPathLlmChatService
 from uipath.platform.chat._llm_gateway_service import ChatModels
+from uipath.platform.chat.llm_gateway import RequiredToolChoice
 
 from .._execution_context import eval_set_run_id_context
 from ._mock_context import cache_manager_context
 from ._mocker import UiPathInputMockingError
+from ._structured_output import build_response_tool, extract_response
 from ._types import (
     InputMockingStrategy,
 )
@@ -105,14 +107,13 @@ async def generate_llm_input(
 
         prompt = get_input_mocking_prompt(**prompt_generation_args)
 
-        response_format = {
-            "type": "json_schema",
-            "json_schema": {
-                "name": "agent_input",
-                "strict": False,
-                "schema": input_schema,
-            },
-        }
+        # Request structured output via function calling so it works across all
+        # model providers (OpenAI, Claude/Bedrock, Gemini); response_format is only
+        # honored for OpenAI models on the normalized gateway.
+        response_tool = build_response_tool(
+            input_schema,
+            description="Return the simulated agent input matching the required schema.",
+        )
 
         model_parameters = mocking_strategy.model if mocking_strategy else None
         completion_kwargs = (
@@ -128,7 +129,7 @@ async def generate_llm_input(
 
         if cache_manager is not None:
             cache_key_data = {
-                "response_format": response_format,
+                "response_tool": response_tool,
                 "completion_kwargs": completion_kwargs,
                 "prompt_generation_args": prompt_generation_args,
             }
@@ -144,12 +145,12 @@ async def generate_llm_input(
 
         response = await llm.chat_completions(
             [{"role": "user", "content": prompt}],
-            response_format=response_format,
+            tools=[response_tool],
+            tool_choice=RequiredToolChoice(),
             **completion_kwargs,
         )
 
-        generated_input_str = response.choices[0].message.content
-        result = json.loads(generated_input_str)
+        result = extract_response(response)
 
         if cache_manager is not None:
             cache_manager.set(
@@ -160,10 +161,6 @@ async def generate_llm_input(
             )
 
         return result
-    except json.JSONDecodeError as e:
-        raise UiPathInputMockingError(
-            f"Failed to parse LLM response as JSON: {str(e)}"
-        ) from e
     except UiPathInputMockingError:
         raise
     except Exception as e:

diff --git a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
@@ -11,6 +11,7 @@
 from uipath.platform import UiPath
 from uipath.platform.chat import UiPathLlmChatService
 from uipath.platform.chat._llm_gateway_service import ChatModels, _cleanup_schema
+from uipath.platform.chat.llm_gateway import RequiredToolChoice
 
 from .._execution_context import (
     eval_set_run_id_context,
@@ -28,6 +29,7 @@
     UiPathMockResponseGenerationError,
     UiPathNoMockFoundError,
 )
+from ._structured_output import build_response_tool, extract_response
 from ._types import (
     ExampleCall,
     LLMMockingStrategy,
@@ -125,14 +127,16 @@ async def response(
                 "output_schema", TypeAdapter(return_type).json_schema()
             )
 
-            response_format = {
-                "type": "json_schema",
-                "json_schema": {
-                    "name": "OutputSchema",
-                    "strict": False,
-                    "schema": _cleanup_schema(output_schema),
-                },
-            }
+            # Request structured output via function calling so it works across
+            # all model providers (OpenAI, Claude/Bedrock, Gemini); response_format
+            # is only honored for OpenAI models on the normalized gateway.
+            response_tool = build_response_tool(
+                _cleanup_schema(output_schema),
+                description=(
+                    "Return the simulated response for tool "
+                    f"'{function_name}' matching the required schema."
+                ),
+            )
             try:
                 # Safely pull examples from params.
                 example_calls = params.get("example_calls", [])
@@ -197,7 +201,7 @@ async def response(
                 formatted_prompt = PROMPT.format(**prompt_generation_args)
 
                 cache_key_data = {
-                    "response_format": response_format,
+                    "response_tool": response_tool,
                     "completion_kwargs": completion_kwargs,
                     "prompt_generation_args": prompt_generation_args,
                 }
@@ -220,10 +224,11 @@ async def response(
                             "content": formatted_prompt,
                         },
                     ],
-                    response_format=response_format,
+                    tools=[response_tool],
+                    tool_choice=RequiredToolChoice(),
                     **completion_kwargs,
                 )
-                result = json.loads(response.choices[0].message.content)
+                result = extract_response(response)
 
                 if cache_manager is not None:
                     cache_manager.set(
@@ -235,7 +240,7 @@ async def response(
 
                 return result
             except Exception as e:
-                raise UiPathMockResponseGenerationError() from e
+                raise UiPathMockResponseGenerationError(str(e)) from e
         else:
             raise UiPathNoMockFoundError(f"Method '{function_name}' is not simulated.")
 

diff --git a/packages/uipath/src/uipath/eval/mocks/_structured_output.py b/packages/uipath/src/uipath/eval/mocks/_structured_output.py
@@ -0,0 +1,70 @@
+"""Provider-agnostic structured output via LLM function calling.
+
+The normalized LLM Gateway honors OpenAI-style ``response_format`` (json_schema)
+only for OpenAI models. Non-OpenAI providers (Anthropic/Claude via Bedrock,
+Gemini) return such requests with ``choices[0].message.content`` empty/None,
+which breaks JSON parsing. Function calling is honored across all providers, so
+the mockers request structured output as a forced tool call and read the result
+from the tool call's parsed arguments.
+"""
+
+from typing import Any
+
+RESPONSE_TOOL_NAME = "submit_tool_response"
+RESPONSE_KEY = "response"
+
+
+def build_response_tool(schema: dict[str, Any], description: str) -> dict[str, Any]:
+    """Build a normalized-API function tool that wraps ``schema`` under ``response``.
+
+    Tool-call arguments are always a JSON object, so an arbitrary output schema
+    (which may be a scalar, array, or object) is nested under a single
+    ``response`` property and unwrapped after the call.
+
+    Schemas from nested Pydantic models carry root ``$defs`` referenced by
+    ``$ref`` values like ``#/$defs/Item``. Those ``$ref`` paths resolve from the
+    parameters root, so the definitions table is hoisted there instead of being
+    buried under ``response`` (which would leave the references dangling). The
+    legacy ``definitions`` keyword (``#/definitions/Item``) is hoisted the same
+    way, for the same reason.
+
+    Args:
+        schema: JSON schema of the value the model must return. It is shallow
+            copied, so the caller's top-level dict is not modified.
+        description: Human-readable description placed on the tool.
+
+    Returns:
+        A dict with ``name``, ``description`` and ``parameters`` keys, where
+        ``parameters`` is an object schema requiring the ``response`` property.
+    """
+    # Shallow copy so popping the definition tables does not touch the caller's dict.
+    response_schema = dict(schema)
+    parameters: dict[str, Any] = {
+        "type": "object",
+        "properties": {RESPONSE_KEY: response_schema},
+        "required": [RESPONSE_KEY],
+    }
+    # Move root-level definition tables up one level so "#/<key>/Name" pointers
+    # still resolve against the root of the parameters schema.
+    for defs_key in ("$defs", "definitions"):
+        defs = response_schema.pop(defs_key, None)
+        if defs is not None:
+            parameters[defs_key] = defs
+
+    return {
+        "name": RESPONSE_TOOL_NAME,
+        "description": description,
+        "parameters": parameters,
+    }
+
+
+def extract_response(response: Any) -> Any:
+    """Extract the wrapped value from the forced tool call.
+
+    Only the first choice and the first tool call on its message are inspected.
+
+    Args:
+        response: Chat-completion result exposing ``choices[0].message`` with
+            ``tool_calls`` whose entries carry parsed ``arguments``.
+
+    Returns:
+        The value stored under the ``response`` key of the tool-call arguments.
+
+    Raises:
+        ValueError: if the response carries no usable tool call (no choices, no
+            tool calls, or arguments that are not a dict) or is missing the
+            wrapped ``response`` key.
+    """
+    choices = getattr(response, "choices", None)
+    if not choices:
+        raise ValueError("LLM response contained no choices")
+
+    # getattr keeps a malformed choice/message on the documented ValueError path
+    # instead of leaking an AttributeError.
+    message = getattr(choices[0], "message", None)
+    tool_calls = getattr(message, "tool_calls", None)
+    if not tool_calls:
+        content = getattr(message, "content", None)
+        raise ValueError(
+            f"LLM response contained no tool calls (content={content!r})"
+        )
+
+    arguments = getattr(tool_calls[0], "arguments", None)
+    # A None or string payload would otherwise raise TypeError (or pass a
+    # substring check) on the membership test below.
+    if not isinstance(arguments, dict):
+        raise ValueError(
+            f"Tool call arguments are not a JSON object: {arguments!r}"
+        )
+    if RESPONSE_KEY not in arguments:
+        raise ValueError(
+            f"Tool call arguments missing '{RESPONSE_KEY}' key: {arguments}"
+        )
+
+    return arguments[RESPONSE_KEY]
diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
@@ -83,10 +83,18 @@ async def test_generate_llm_input_with_model_settings(
                     "index": 0,
                     "message": {
                         "role": "assistant",
-                        "content": '{"query": "Calculate 5 times 7"}',
-                        "tool_calls": None,
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {
+                                    "response": {"query": "Calculate 5 times 7"}
+                                },
+                            }
+                        ],
                     },
-                    "finish_reason": "stop",
+                    "finish_reason": "tool_calls",
                 }
             ],
             "usage": {
@@ -112,3 +120,15 @@ async def test_generate_llm_input_with_model_settings(
     assert len(chat_completion_requests) == 1, (
         "Expected exactly one chat completion request"
     )
+
+    # Structured output is requested via function calling (provider-agnostic),
+    # not via response_format which the gateway only honors for OpenAI models.
+    import json
+
+    body = json.loads(chat_completion_requests[0].content.decode("utf-8"))
+    assert "response_format" not in body
+    assert body["tool_choice"] == {"type": "required"}
+    tools = body["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == input_schema