diff --git a/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py b/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py
index ffe0bff99..bc89fd82a 100644
--- a/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py
+++ b/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py
@@ -401,7 +401,7 @@ async def chat_completions(
         presence_penalty: float = 0,
         top_p: float | None = 1,
         top_k: int | None = None,
-        tools: list[ToolDefinition] | None = None,
+        tools: list[ToolDefinition | dict[str, Any]] | None = None,
         tool_choice: ToolChoice | None = None,
         response_format: dict[str, Any] | type[BaseModel] | None = None,
         api_version: str = NORMALIZED_API_VERSION,
@@ -583,10 +583,15 @@ class Country(BaseModel):
                 # Use provided dictionary format directly
                 request_body["response_format"] = response_format
 
-        # Add tools if provided - convert to UiPath format
+        # Add tools if provided. A tool already in UiPath wire format (a dict) is
+        # passed through unchanged so callers can supply an arbitrary JSON schema
+        # for the parameters; ToolDefinition objects are converted as before.
         if tools:
             request_body["tools"] = [
-                self._convert_tool_to_uipath_format(tool) for tool in tools
+                tool
+                if isinstance(tool, dict)
+                else self._convert_tool_to_uipath_format(tool)
+                for tool in tools
             ]
 
         # Handle tool_choice
diff --git a/packages/uipath-platform/tests/services/test_uipath_llm_integration.py b/packages/uipath-platform/tests/services/test_uipath_llm_integration.py
index 124ccad8b..9e2292c60 100644
--- a/packages/uipath-platform/tests/services/test_uipath_llm_integration.py
+++ b/packages/uipath-platform/tests/services/test_uipath_llm_integration.py
@@ -7,6 +7,7 @@
 from uipath.platform.chat import (
     AutoToolChoice,
     ChatModels,
+    RequiredToolChoice,
     SpecificToolChoice,
     ToolDefinition,
     ToolFunctionDefinition,
@@ -369,6 +370,87 @@ async def test_tool_call_required_mocked(self, mock_request, llm_service):
         assert result.choices[0].message.tool_calls[0].arguments["name"] == "John"
         assert result.choices[0].message.tool_calls[0].arguments["password"] == "1234"
 
+    @pytest.mark.asyncio
+    @patch.object(UiPathLlmChatService, "request_async")
+    async def test_raw_dict_tool_passthrough_mocked(self, mock_request, llm_service):
+        """A tool supplied as a raw dict is sent unchanged, preserving nested schema.
+
+        ToolDefinition's converter only emits flat properties, so callers needing an
+        arbitrary nested JSON schema (e.g. the eval mockers) pass a dict already in
+        UiPath wire format; the request body must carry it without modification.
+        """
+        mock_response = MagicMock()
+        mock_response.json.return_value = {
+            "id": "chatcmpl-raw",
+            "object": "chat.completion",
+            "created": 1677858242,
+            "model": "gpt-4o-mini-2024-07-18",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_raw",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": {"items": [{"sku": "A1"}]}},
+                            }
+                        ],
+                    },
+                    "finish_reason": "tool_calls",
+                }
+            ],
+            "usage": {
+                "prompt_tokens": 10,
+                "completion_tokens": 5,
+                "total_tokens": 15,
+                "cache_read_input_tokens": None,
+            },
+        }
+        mock_request.return_value = mock_response  # canned gateway reply
+
+        nested_tool = {
+            "name": "submit_tool_response",
+            "description": "Return the simulated response matching the schema.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "response": {
+                        "type": "object",
+                        "properties": {
+                            "items": {
+                                "type": "array",
+                                "items": {
+                                    "type": "object",
+                                    "properties": {"sku": {"type": "string"}},
+                                },
+                            }
+                        },
+                    }
+                },
+                "required": ["response"],
+            },
+        }
+
+        result = await llm_service.chat_completions(
+            messages=[{"role": "user", "content": "go"}],
+            model=ChatModels.gpt_4_1_mini_2025_04_14,
+            tools=[nested_tool],
+            tool_choice=RequiredToolChoice(),
+        )
+
+        mock_request.assert_called_once()
+        _, kwargs = mock_request.call_args
+        body = kwargs["json"]  # JSON payload handed to the patched request_async
+        # The tool reaches the request body equal to the input, nested schema intact.
+        assert body["tools"] == [nested_tool]
+        assert body["tool_choice"] == {"type": "required"}
+        assert result.choices[0].message.tool_calls[0].arguments == {
+            "response": {"items": [{"sku": "A1"}]}
+        }
+
     @pytest.mark.asyncio
     @patch.object(UiPathLlmChatService, "request_async")
     async def test_chat_with_conversation_history_mocked(
diff --git a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
index 57a727ec1..3c4daac51 100644
--- a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
+++ b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
@@ -11,10 +11,12 @@
 from uipath.platform import UiPath
 from uipath.platform.chat import UiPathLlmChatService
 from uipath.platform.chat._llm_gateway_service import ChatModels
+from uipath.platform.chat.llm_gateway import RequiredToolChoice
 
 from .._execution_context import eval_set_run_id_context
 from ._mock_context import cache_manager_context
 from ._mocker import UiPathInputMockingError
+from ._structured_output import build_response_tool, extract_response
 from ._types import (
     InputMockingStrategy,
 )
@@ -105,14 +107,13 @@ async def generate_llm_input(
 
         prompt = get_input_mocking_prompt(**prompt_generation_args)
 
-        response_format = {
-            "type": "json_schema",
-            "json_schema": {
-                "name": "agent_input",
-                "strict": False,
-                "schema": input_schema,
-            },
-        }
+        # Request structured output via function calling so it works across all
+        # model providers (OpenAI, Claude/Bedrock, Gemini); response_format is only
+        # honored for OpenAI models on the normalized gateway.
+        response_tool = build_response_tool(
+            input_schema,
+            description="Return the simulated agent input matching the required schema.",
+        )
 
         model_parameters = mocking_strategy.model if mocking_strategy else None
         completion_kwargs = (
@@ -128,7 +129,7 @@ async def generate_llm_input(
 
         if cache_manager is not None:
             cache_key_data = {
-                "response_format": response_format,
+                "response_tool": response_tool,
                 "completion_kwargs": completion_kwargs,
                 "prompt_generation_args": prompt_generation_args,
             }
@@ -144,12 +145,12 @@ async def generate_llm_input(
 
         response = await llm.chat_completions(
             [{"role": "user", "content": prompt}],
-            response_format=response_format,
+            tools=[response_tool],
+            tool_choice=RequiredToolChoice(),
             **completion_kwargs,
         )
 
-        generated_input_str = response.choices[0].message.content
-        result = json.loads(generated_input_str)
+        result = extract_response(response)
 
         if cache_manager is not None:
             cache_manager.set(
@@ -160,10 +161,6 @@ async def generate_llm_input(
             )
 
         return result
-    except json.JSONDecodeError as e:
-        raise UiPathInputMockingError(
-            f"Failed to parse LLM response as JSON: {str(e)}"
-        ) from e
     except UiPathInputMockingError:
         raise
     except Exception as e:
diff --git a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
index d1fd2a1c9..ce932da11 100644
--- a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
+++ b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
@@ -11,6 +11,7 @@
 from uipath.platform import UiPath
 from uipath.platform.chat import UiPathLlmChatService
 from uipath.platform.chat._llm_gateway_service import ChatModels, _cleanup_schema
+from uipath.platform.chat.llm_gateway import RequiredToolChoice
 
 from .._execution_context import (
     eval_set_run_id_context,
@@ -28,6 +29,7 @@
     UiPathMockResponseGenerationError,
     UiPathNoMockFoundError,
 )
+from ._structured_output import build_response_tool, extract_response
 from ._types import (
     ExampleCall,
     LLMMockingStrategy,
@@ -125,14 +127,16 @@ async def response(
                 "output_schema", TypeAdapter(return_type).json_schema()
             )
 
-            response_format = {
-                "type": "json_schema",
-                "json_schema": {
-                    "name": "OutputSchema",
-                    "strict": False,
-                    "schema": _cleanup_schema(output_schema),
-                },
-            }
+            # Request structured output via function calling so it works across
+            # all model providers (OpenAI, Claude/Bedrock, Gemini); response_format
+            # is only honored for OpenAI models on the normalized gateway.
+            response_tool = build_response_tool(
+                _cleanup_schema(output_schema),
+                description=(
+                    "Return the simulated response for tool "
+                    f"'{function_name}' matching the required schema."
+                ),
+            )
             try:
                 # Safely pull examples from params.
                 example_calls = params.get("example_calls", [])
@@ -197,7 +201,7 @@ async def response(
                 formatted_prompt = PROMPT.format(**prompt_generation_args)
 
                 cache_key_data = {
-                    "response_format": response_format,
+                    "response_tool": response_tool,
                     "completion_kwargs": completion_kwargs,
                     "prompt_generation_args": prompt_generation_args,
                 }
@@ -220,10 +224,11 @@ async def response(
                             "content": formatted_prompt,
                         },
                     ],
-                    response_format=response_format,
+                    tools=[response_tool],
+                    tool_choice=RequiredToolChoice(),
                     **completion_kwargs,
                 )
-                result = json.loads(response.choices[0].message.content)
+                result = extract_response(response)
 
                 if cache_manager is not None:
                     cache_manager.set(
@@ -235,7 +240,7 @@ async def response(
 
                 return result
             except Exception as e:
-                raise UiPathMockResponseGenerationError() from e
+                raise UiPathMockResponseGenerationError(str(e)) from e
         else:
             raise UiPathNoMockFoundError(f"Method '{function_name}' is not simulated.")
 
diff --git a/packages/uipath/src/uipath/eval/mocks/_structured_output.py b/packages/uipath/src/uipath/eval/mocks/_structured_output.py
new file mode 100644
index 000000000..424935190
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/mocks/_structured_output.py
@@ -0,0 +1,70 @@
+"""Provider-agnostic structured output via LLM function calling.
+
+The normalized LLM Gateway honors OpenAI-style ``response_format`` (json_schema)
+only for OpenAI models. Non-OpenAI providers (Anthropic/Claude via Bedrock,
+Gemini) return such requests with ``choices[0].message.content`` empty/None,
+which breaks JSON parsing. Function calling is honored across all providers, so
+the mockers request structured output as a forced tool call and read the result
+from the tool call's parsed arguments.
+"""
+
+from typing import Any
+
+RESPONSE_TOOL_NAME = "submit_tool_response"
+RESPONSE_KEY = "response"
+
+
+def build_response_tool(schema: dict[str, Any], description: str) -> dict[str, Any]:
+    """Build a normalized-API function tool that wraps ``schema`` under ``response``.
+
+    Tool-call arguments are always a JSON object, so an arbitrary output schema
+    (which may be a scalar, array, or object) is nested under a single
+    ``response`` property and unwrapped after the call.
+
+    Schemas from nested Pydantic models carry root ``$defs`` referenced by
+    ``$ref`` values like ``#/$defs/Item``. Those ``$ref`` paths resolve from the
+    parameters root, so ``$defs`` is hoisted there instead of being buried under
+    ``response`` (which would leave the references dangling).
+    """
+    response_schema = dict(schema)
+    parameters: dict[str, Any] = {
+        "type": "object",
+        "properties": {RESPONSE_KEY: response_schema},
+        "required": [RESPONSE_KEY],
+    }
+    defs = response_schema.pop("$defs", None)
+    if defs is not None:
+        parameters["$defs"] = defs
+
+    return {
+        "name": RESPONSE_TOOL_NAME,
+        "description": description,
+        "parameters": parameters,
+    }
+
+
+def extract_response(response: Any) -> Any:
+    """Extract the wrapped value from the forced tool call.
+
+    Raises:
+        ValueError: if the response carries no usable tool call or is missing the
+            wrapped ``response`` key.
+    """
+    choices = getattr(response, "choices", None)
+    if not choices:
+        raise ValueError("LLM response contained no choices")
+
+    message = choices[0].message
+    tool_calls = getattr(message, "tool_calls", None)
+    if not tool_calls:
+        raise ValueError(
+            f"LLM response contained no tool calls (content={message.content!r})"
+        )
+
+    arguments = tool_calls[0].arguments
+    if RESPONSE_KEY not in arguments:
+        raise ValueError(
+            f"Tool call arguments missing '{RESPONSE_KEY}' key: {arguments}"
+        )
+
+    return arguments[RESPONSE_KEY]
diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
index 72b3765df..181a14b6a 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
@@ -83,10 +83,18 @@ async def test_generate_llm_input_with_model_settings(
                     "index": 0,
                     "message": {
                         "role": "assistant",
-                        "content": '{"query": "Calculate 5 times 7"}',
-                        "tool_calls": None,
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {
+                                    "response": {"query": "Calculate 5 times 7"}
+                                },
+                            }
+                        ],
                     },
-                    "finish_reason": "stop",
+                    "finish_reason": "tool_calls",
                 }
             ],
             "usage": {
@@ -112,3 +120,15 @@ async def test_generate_llm_input_with_model_settings(
     assert len(chat_completion_requests) == 1, (
         "Expected exactly one chat completion request"
     )
+
+    # Structured output is requested via function calling (provider-agnostic),
+    # not via response_format which the gateway only honors for OpenAI models.
+    import json
+
+    body = json.loads(chat_completion_requests[0].content.decode("utf-8"))
+    assert "response_format" not in body
+    assert body["tool_choice"] == {"type": "required"}
+    tools = body["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == input_schema
diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
index 19a432fef..aeffa2d1e 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
@@ -57,10 +57,21 @@ async def test_simulate_input_span_attributes(httpx_mock: HTTPXMock, monkeypatch
                         "index": 0,
                         "message": {
                             "role": "assistant",
-                            "content": '{"name": "Alice", "greeting_style": "formal"}',
-                            "tool_calls": None,
+                            "content": None,
+                            "tool_calls": [
+                                {
+                                    "id": "call_1",
+                                    "name": "submit_tool_response",
+                                    "arguments": {
+                                        "response": {
+                                            "name": "Alice",
+                                            "greeting_style": "formal",
+                                        }
+                                    },
+                                }
+                            ],
                         },
-                        "finish_reason": "stop",
+                        "finish_reason": "tool_calls",
                     }
                 ],
                 "usage": {
@@ -199,10 +210,17 @@ async def test_simulate_input_span_on_error(httpx_mock: HTTPXMock, monkeypatch):
                         "index": 0,
                         "message": {
                             "role": "assistant",
-                            "content": "invalid json{{{",  # Invalid JSON
-                            "tool_calls": None,
+                            # Malformed: tool call is missing the wrapped "response" key
+                            "content": None,
+                            "tool_calls": [
+                                {
+                                    "id": "call_1",
+                                    "name": "submit_tool_response",
+                                    "arguments": {},
+                                }
+                            ],
                         },
-                        "finish_reason": "stop",
+                        "finish_reason": "tool_calls",
                     }
                 ],
                 "usage": {
diff --git a/packages/uipath/tests/cli/eval/mocks/test_mocks.py b/packages/uipath/tests/cli/eval/mocks/test_mocks.py
index c4bc26ee3..521871d5f 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_mocks.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_mocks.py
@@ -569,11 +569,17 @@ def foofoo(*args, **kwargs):
                 {
                     "index": 0,
                     "message": {
-                        "role": "ai",
-                        "content": '"bar1"',
-                        "tool_calls": None,
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": "bar1"},
+                            }
+                        ],
                     },
-                    "finish_reason": "EOS",
+                    "finish_reason": "tool_calls",
                 }
             ],
             "usage": {
@@ -599,14 +605,13 @@ def foofoo(*args, **kwargs):
     mock_request = httpx_mock.get_request(method="POST")
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert request["response_format"] == {
-        "type": "json_schema",
-        "json_schema": {
-            "name": "OutputSchema",
-            "strict": False,
-            "schema": {"type": "string"},
-        },
-    }
+    assert "response_format" not in request
+    assert request["tool_choice"] == {"type": "required"}
+    tools = request["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"}
+    assert tools[0]["parameters"]["required"] == ["response"]
 
     with pytest.raises(NotImplementedError):
         assert foofoo()
@@ -678,11 +683,17 @@ async def foofoo(*args, **kwargs):
                 {
                     "index": 0,
                     "message": {
-                        "role": "ai",
-                        "content": '"bar1"',
-                        "tool_calls": None,
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": "bar1"},
+                            }
+                        ],
                     },
-                    "finish_reason": "EOS",
+                    "finish_reason": "tool_calls",
                 }
             ],
             "usage": {
@@ -708,14 +719,13 @@ async def foofoo(*args, **kwargs):
     mock_request = httpx_mock.get_request()
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert request["response_format"] == {
-        "type": "json_schema",
-        "json_schema": {
-            "name": "OutputSchema",
-            "strict": False,
-            "schema": {"type": "string"},
-        },
-    }
+    assert "response_format" not in request
+    assert request["tool_choice"] == {"type": "required"}
+    tools = request["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"}
+    assert tools[0]["parameters"]["required"] == ["response"]
 
     with pytest.raises(NotImplementedError):
         assert await foofoo()
@@ -786,11 +796,17 @@ def foo(*args, **kwargs) -> dict[str, Any]:
                 {
                     "index": 0,
                     "message": {
-                        "role": "ai",
-                        "content": '{"content": "bar1"}',
-                        "tool_calls": None,
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": {"content": "bar1"}},
+                            }
+                        ],
                     },
-                    "finish_reason": "EOS",
+                    "finish_reason": "tool_calls",
                 }
             ],
             "usage": {
@@ -815,19 +831,18 @@ def foo(*args, **kwargs) -> dict[str, Any]:
     mock_request = httpx_mock.get_request()
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert request["response_format"] == {
-        "type": "json_schema",
-        "json_schema": {
-            "name": "OutputSchema",
-            "strict": False,
-            "schema": {
-                "required": ["content"],
-                "type": "object",
-                "additionalProperties": False,
-                "properties": {"content": {"type": "string"}},
-            },
-        },
+    assert "response_format" not in request
+    assert request["tool_choice"] == {"type": "required"}
+    tools = request["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == {
+        "required": ["content"],
+        "type": "object",
+        "additionalProperties": False,
+        "properties": {"content": {"type": "string"}},
     }
+    assert tools[0]["parameters"]["required"] == ["response"]
 
 
 @pytest.mark.asyncio
@@ -887,11 +902,17 @@ async def foo(*args, **kwargs) -> dict[str, Any]:
                 {
                     "index": 0,
                     "message": {
-                        "role": "ai",
-                        "content": '{"content": "bar1"}',
-                        "tool_calls": None,
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": {"content": "bar1"}},
+                            }
+                        ],
                     },
-                    "finish_reason": "EOS",
+                    "finish_reason": "tool_calls",
                 }
             ],
             "usage": {
@@ -916,19 +937,136 @@ async def foo(*args, **kwargs) -> dict[str, Any]:
     mock_request = httpx_mock.get_request()
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert request["response_format"] == {
-        "type": "json_schema",
-        "json_schema": {
-            "name": "OutputSchema",
-            "strict": False,
-            "schema": {
-                "required": ["content"],
-                "type": "object",
-                "additionalProperties": False,
-                "properties": {"content": {"type": "string"}},
-            },
+    assert "response_format" not in request
+    assert request["tool_choice"] == {"type": "required"}
+    tools = request["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == {
+        "required": ["content"],
+        "type": "object",
+        "additionalProperties": False,
+        "properties": {"content": {"type": "string"}},
+    }
+    assert tools[0]["parameters"]["required"] == ["response"]
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "gpt-4.1-mini-2025-04-14",
+        "anthropic.claude-sonnet-4-5-20250929-v1:0",
+        "gemini-2.5-pro",
+    ],
+)
+@pytest.mark.asyncio
+@pytest.mark.httpx_mock(assert_all_responses_were_requested=False)
+async def test_llm_mockable_structured_output_via_tool_call(
+    model: str, httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch
+):
+    """Tool simulation must work for all model providers (AE-1646).
+
+    The mocker requests structured output via function calling and reads the
+    result from the forced tool call's arguments, so it does not depend on the
+    OpenAI-only ``choices[0].message.content`` shape. Non-OpenAI providers
+    (Claude/Bedrock, Gemini) return structured output through ``tool_calls`` with
+    ``content`` set to ``None``; that must not raise.
+    """
+    monkeypatch.setenv("UIPATH_URL", "https://example.com")
+    monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "1234567890")
+    monkeypatch.setattr(CacheManager, "get", lambda *args, **kwargs: None)
+    monkeypatch.setattr(CacheManager, "set", lambda *args, **kwargs: None)
+
+    @mockable()
+    async def foo(*args, **kwargs) -> str:
+        raise NotImplementedError()
+
+    evaluation_item: dict[str, Any] = {
+        "id": "evaluation-id",
+        "name": "Mock foo",
+        "inputs": {},
+        "evaluationCriterias": {
+            "ExactMatchEvaluator": None,
+        },
+        "mockingStrategy": {
+            "type": "llm",
+            "prompt": "response is 'bar1'",
+            "toolsToSimulate": [{"name": "foo"}],
+            "model": {"model": model},
         },
     }
+    evaluation = EvaluationItem(**evaluation_item)
+    assert isinstance(evaluation.mocking_strategy, LLMMockingStrategy)
+    httpx_mock.add_response(
+        url="https://example.com/agenthub_/llm/api/capabilities",
+        status_code=200,
+        json={},
+    )
+    httpx_mock.add_response(
+        url="https://example.com/orchestrator_/llm/api/capabilities",
+        status_code=200,
+        json={},
+    )
+
+    httpx_mock.add_response(
+        url="https://example.com/llm/api/chat/completions"
+        "?api-version=2024-08-01-preview",
+        status_code=200,
+        json={
+            "id": "response-id",
+            "object": "",
+            "created": 0,
+            "model": model,
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": "bar1"},
+                            }
+                        ],
+                    },
+                    "finish_reason": "tool_calls",
+                }
+            ],
+            "usage": {
+                "prompt_tokens": 1,
+                "completion_tokens": 1,
+                "total_tokens": 2,
+            },
+        },
+    )
+
+    set_execution_context(
+        MockingContext(
+            strategy=evaluation.mocking_strategy,
+            name=evaluation.name,
+            inputs=evaluation.inputs,
+        ),
+        _mock_span_collector,
+        "test-execution-id",
+    )
+
+    assert await foo() == "bar1"
+
+    mock_request = httpx_mock.get_request(method="POST")
+    assert mock_request
+    request = json.loads(mock_request.content.decode("utf-8"))
+    # Structured output is requested via function calling, not response_format,
+    # so it works across all providers.
+    assert "response_format" not in request
+    assert request["tool_choice"] == {"type": "required"}
+    assert mock_request.headers["X-UiPath-LlmGateway-NormalizedApi-ModelName"] == model
+    tools = request["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"}
+    assert tools[0]["parameters"]["required"] == ["response"]
 
 
 class TestUiPathMockRuntime:
diff --git a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
new file mode 100644
index 000000000..5cb0fc1fb
--- /dev/null
+++ b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
@@ -0,0 +1,73 @@
+"""Unit tests for the provider-agnostic structured-output helpers."""
+
+from types import SimpleNamespace
+
+import pytest
+
+from uipath.eval.mocks._structured_output import (
+    RESPONSE_KEY,
+    RESPONSE_TOOL_NAME,
+    build_response_tool,
+    extract_response,
+)
+
+
+def _response(message: SimpleNamespace | None) -> SimpleNamespace:
+    choices = [] if message is None else [SimpleNamespace(message=message)]
+    return SimpleNamespace(choices=choices)
+
+
+def test_build_response_tool_wraps_schema_under_response():
+    tool = build_response_tool({"type": "string"}, description="desc")
+    assert tool["name"] == RESPONSE_TOOL_NAME
+    assert tool["description"] == "desc"
+    assert tool["parameters"]["properties"][RESPONSE_KEY] == {"type": "string"}
+    assert tool["parameters"]["required"] == [RESPONSE_KEY]
+
+
+def test_build_response_tool_hoists_defs_to_root():
+    # Nested Pydantic models emit root $defs + $ref. Wrapping the schema under
+    # "response" must hoist $defs to the tool-parameters root so "#/$defs/Item"
+    # still resolves; otherwise nested-model schemas are invalid.
+    item_def = {"type": "object", "properties": {"sku": {"type": "string"}}}
+    schema = {
+        "type": "object",
+        "properties": {"items": {"type": "array", "items": {"$ref": "#/$defs/Item"}}},
+        "$defs": {"Item": item_def},
+    }
+
+    tool = build_response_tool(schema, description="d")
+    params = tool["parameters"]
+
+    assert params["$defs"] == {"Item": item_def}
+    assert "$defs" not in params["properties"][RESPONSE_KEY]
+    # the caller's schema dict is not mutated
+    assert "$defs" in schema
+
+
+def test_extract_response_returns_wrapped_value():
+    message = SimpleNamespace(
+        content=None,
+        tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: {"a": 1}})],
+    )
+    assert extract_response(_response(message)) == {"a": 1}
+
+
+def test_extract_response_raises_when_no_choices():
+    with pytest.raises(ValueError, match="no choices"):
+        extract_response(_response(None))
+
+
+def test_extract_response_raises_when_no_tool_calls():
+    # Non-OpenAI text response without a tool call: surface a clear error.
+    message = SimpleNamespace(content="not a tool call", tool_calls=None)
+    with pytest.raises(ValueError, match="no tool calls"):
+        extract_response(_response(message))
+
+
+def test_extract_response_raises_when_response_key_missing():
+    message = SimpleNamespace(
+        content=None, tool_calls=[SimpleNamespace(arguments={"other": 1})]
+    )
+    with pytest.raises(ValueError, match=RESPONSE_KEY):
+        extract_response(_response(message))