From f6bfa9c55a791cb4bc8f7030d433aff7eba29fb1 Mon Sep 17 00:00:00 2001
From: Chibi Vikram <chibicha@live.com>
Date: Thu, 28 May 2026 23:57:28 -0700
Subject: [PATCH 1/5] fix(eval): use function calling for tool/input mocking so
 non-OpenAI models work

Tool simulation and input generation in Studio Debug and Evaluation Set runs
failed with AGENT_RUNTIME.UNEXPECTED_ERROR for non-OpenAI models (Anthropic
Claude via Bedrock, Gemini). The mockers requested structured output via
OpenAI-only `response_format` json_schema and parsed `choices[0].message.content`;
for Claude that content is empty/None, so `json.loads(...)` raised.

Switch both mockers to provider-agnostic function calling (mirrors
llm_as_judge_evaluator): build a forced tool that wraps the output/input schema
under a `response` property, force it via tool_choice, and read
`tool_calls[0].arguments["response"]` (`arguments` is already a parsed dict, so
no `json.loads` is needed). Hoist the schema's root `$defs` to the
tool-parameters root so `$ref`s from nested Pydantic models still resolve. The
normalized LLM gateway now accepts raw-dict tools so arbitrary nested schemas
survive (the ToolDefinition converter only emits flat properties).

Regression introduced by #1555, which started routing the agent's model into
simulations; before that, simulation always used a fixed OpenAI model, so
non-OpenAI providers were never exercised on this path.

Fixes AE-1646.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../platform/chat/_llm_gateway_service.py     |  11 +-
 .../services/test_uipath_llm_integration.py   |  82 ++++++
 .../src/uipath/eval/mocks/_input_mocker.py    |  29 +-
 .../src/uipath/eval/mocks/_llm_mocker.py      |  29 +-
 .../uipath/eval/mocks/_structured_output.py   |  70 +++++
 .../tests/cli/eval/mocks/test_input_mocker.py |  26 +-
 .../cli/eval/mocks/test_input_mocker_span.py  |  30 ++-
 .../uipath/tests/cli/eval/mocks/test_mocks.py | 248 ++++++++++++++----
 .../cli/eval/mocks/test_structured_output.py  |  73 ++++++
 9 files changed, 503 insertions(+), 95 deletions(-)
 create mode 100644 packages/uipath/src/uipath/eval/mocks/_structured_output.py
 create mode 100644 packages/uipath/tests/cli/eval/mocks/test_structured_output.py

diff --git a/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py b/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py
index ffe0bff99..bc89fd82a 100644
--- a/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py
+++ b/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py
@@ -401,7 +401,7 @@ async def chat_completions(
         presence_penalty: float = 0,
         top_p: float | None = 1,
         top_k: int | None = None,
-        tools: list[ToolDefinition] | None = None,
+        tools: list[ToolDefinition | dict[str, Any]] | None = None,
         tool_choice: ToolChoice | None = None,
         response_format: dict[str, Any] | type[BaseModel] | None = None,
         api_version: str = NORMALIZED_API_VERSION,
@@ -583,10 +583,15 @@ class Country(BaseModel):
                 # Use provided dictionary format directly
                 request_body["response_format"] = response_format
 
-        # Add tools if provided - convert to UiPath format
+        # Add tools if provided. A tool already in UiPath wire format (a dict) is
+        # passed through unchanged so callers can supply an arbitrary JSON schema
+        # for the parameters; ToolDefinition objects are converted as before.
         if tools:
             request_body["tools"] = [
-                self._convert_tool_to_uipath_format(tool) for tool in tools
+                tool
+                if isinstance(tool, dict)
+                else self._convert_tool_to_uipath_format(tool)
+                for tool in tools
             ]
 
         # Handle tool_choice
diff --git a/packages/uipath-platform/tests/services/test_uipath_llm_integration.py b/packages/uipath-platform/tests/services/test_uipath_llm_integration.py
index 124ccad8b..9e2292c60 100644
--- a/packages/uipath-platform/tests/services/test_uipath_llm_integration.py
+++ b/packages/uipath-platform/tests/services/test_uipath_llm_integration.py
@@ -7,6 +7,7 @@
 from uipath.platform.chat import (
     AutoToolChoice,
     ChatModels,
+    RequiredToolChoice,
     SpecificToolChoice,
     ToolDefinition,
     ToolFunctionDefinition,
@@ -369,6 +370,87 @@ async def test_tool_call_required_mocked(self, mock_request, llm_service):
         assert result.choices[0].message.tool_calls[0].arguments["name"] == "John"
         assert result.choices[0].message.tool_calls[0].arguments["password"] == "1234"
 
+    @pytest.mark.asyncio
+    @patch.object(UiPathLlmChatService, "request_async")
+    async def test_raw_dict_tool_passthrough_mocked(self, mock_request, llm_service):
+        """A tool supplied as a raw dict is sent unchanged, preserving nested schema.
+
+        ToolDefinition's converter only emits flat properties, so callers that need
+        an arbitrary nested JSON schema (e.g. the eval mockers) pass the tool as a
+        dict already in UiPath wire format. It must reach the gateway verbatim.
+        """
+        mock_response = MagicMock()
+        mock_response.json.return_value = {
+            "id": "chatcmpl-raw",
+            "object": "chat.completion",
+            "created": 1677858242,
+            "model": "gpt-4o-mini-2024-07-18",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_raw",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": {"items": [{"sku": "A1"}]}},
+                            }
+                        ],
+                    },
+                    "finish_reason": "tool_calls",
+                }
+            ],
+            "usage": {
+                "prompt_tokens": 10,
+                "completion_tokens": 5,
+                "total_tokens": 15,
+                "cache_read_input_tokens": None,
+            },
+        }
+        mock_request.return_value = mock_response
+
+        nested_tool = {
+            "name": "submit_tool_response",
+            "description": "Return the simulated response matching the schema.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "response": {
+                        "type": "object",
+                        "properties": {
+                            "items": {
+                                "type": "array",
+                                "items": {
+                                    "type": "object",
+                                    "properties": {"sku": {"type": "string"}},
+                                },
+                            }
+                        },
+                    }
+                },
+                "required": ["response"],
+            },
+        }
+
+        result = await llm_service.chat_completions(
+            messages=[{"role": "user", "content": "go"}],
+            model=ChatModels.gpt_4_1_mini_2025_04_14,
+            tools=[nested_tool],
+            tool_choice=RequiredToolChoice(),
+        )
+
+        mock_request.assert_called_once()
+        _, kwargs = mock_request.call_args
+        body = kwargs["json"]
+        # The dict tool is forwarded byte-for-byte, nested array schema intact.
+        assert body["tools"] == [nested_tool]
+        assert body["tool_choice"] == {"type": "required"}
+        assert result.choices[0].message.tool_calls[0].arguments == {
+            "response": {"items": [{"sku": "A1"}]}
+        }
+
     @pytest.mark.asyncio
     @patch.object(UiPathLlmChatService, "request_async")
     async def test_chat_with_conversation_history_mocked(
diff --git a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
index 57a727ec1..3c4daac51 100644
--- a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
+++ b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
@@ -11,10 +11,12 @@
 from uipath.platform import UiPath
 from uipath.platform.chat import UiPathLlmChatService
 from uipath.platform.chat._llm_gateway_service import ChatModels
+from uipath.platform.chat.llm_gateway import RequiredToolChoice
 
 from .._execution_context import eval_set_run_id_context
 from ._mock_context import cache_manager_context
 from ._mocker import UiPathInputMockingError
+from ._structured_output import build_response_tool, extract_response
 from ._types import (
     InputMockingStrategy,
 )
@@ -105,14 +107,13 @@ async def generate_llm_input(
 
         prompt = get_input_mocking_prompt(**prompt_generation_args)
 
-        response_format = {
-            "type": "json_schema",
-            "json_schema": {
-                "name": "agent_input",
-                "strict": False,
-                "schema": input_schema,
-            },
-        }
+        # Request structured output via function calling so it works across all
+        # model providers (OpenAI, Claude/Bedrock, Gemini); response_format is only
+        # honored for OpenAI models on the normalized gateway.
+        response_tool = build_response_tool(
+            input_schema,
+            description="Return the simulated agent input matching the required schema.",
+        )
 
         model_parameters = mocking_strategy.model if mocking_strategy else None
         completion_kwargs = (
@@ -128,7 +129,7 @@ async def generate_llm_input(
 
         if cache_manager is not None:
             cache_key_data = {
-                "response_format": response_format,
+                "response_tool": response_tool,
                 "completion_kwargs": completion_kwargs,
                 "prompt_generation_args": prompt_generation_args,
             }
@@ -144,12 +145,12 @@ async def generate_llm_input(
 
         response = await llm.chat_completions(
             [{"role": "user", "content": prompt}],
-            response_format=response_format,
+            tools=[response_tool],
+            tool_choice=RequiredToolChoice(),
             **completion_kwargs,
         )
 
-        generated_input_str = response.choices[0].message.content
-        result = json.loads(generated_input_str)
+        result = extract_response(response)
 
         if cache_manager is not None:
             cache_manager.set(
@@ -160,10 +161,6 @@ async def generate_llm_input(
             )
 
         return result
-    except json.JSONDecodeError as e:
-        raise UiPathInputMockingError(
-            f"Failed to parse LLM response as JSON: {str(e)}"
-        ) from e
     except UiPathInputMockingError:
         raise
     except Exception as e:
diff --git a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
index d1fd2a1c9..ce932da11 100644
--- a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
+++ b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
@@ -11,6 +11,7 @@
 from uipath.platform import UiPath
 from uipath.platform.chat import UiPathLlmChatService
 from uipath.platform.chat._llm_gateway_service import ChatModels, _cleanup_schema
+from uipath.platform.chat.llm_gateway import RequiredToolChoice
 
 from .._execution_context import (
     eval_set_run_id_context,
@@ -28,6 +29,7 @@
     UiPathMockResponseGenerationError,
     UiPathNoMockFoundError,
 )
+from ._structured_output import build_response_tool, extract_response
 from ._types import (
     ExampleCall,
     LLMMockingStrategy,
@@ -125,14 +127,16 @@ async def response(
                 "output_schema", TypeAdapter(return_type).json_schema()
             )
 
-            response_format = {
-                "type": "json_schema",
-                "json_schema": {
-                    "name": "OutputSchema",
-                    "strict": False,
-                    "schema": _cleanup_schema(output_schema),
-                },
-            }
+            # Request structured output via function calling so it works across
+            # all model providers (OpenAI, Claude/Bedrock, Gemini); response_format
+            # is only honored for OpenAI models on the normalized gateway.
+            response_tool = build_response_tool(
+                _cleanup_schema(output_schema),
+                description=(
+                    "Return the simulated response for tool "
+                    f"'{function_name}' matching the required schema."
+                ),
+            )
             try:
                 # Safely pull examples from params.
                 example_calls = params.get("example_calls", [])
@@ -197,7 +201,7 @@ async def response(
                 formatted_prompt = PROMPT.format(**prompt_generation_args)
 
                 cache_key_data = {
-                    "response_format": response_format,
+                    "response_tool": response_tool,
                     "completion_kwargs": completion_kwargs,
                     "prompt_generation_args": prompt_generation_args,
                 }
@@ -220,10 +224,11 @@ async def response(
                             "content": formatted_prompt,
                         },
                     ],
-                    response_format=response_format,
+                    tools=[response_tool],
+                    tool_choice=RequiredToolChoice(),
                     **completion_kwargs,
                 )
-                result = json.loads(response.choices[0].message.content)
+                result = extract_response(response)
 
                 if cache_manager is not None:
                     cache_manager.set(
@@ -235,7 +240,7 @@ async def response(
 
                 return result
             except Exception as e:
-                raise UiPathMockResponseGenerationError() from e
+                raise UiPathMockResponseGenerationError(str(e)) from e
         else:
             raise UiPathNoMockFoundError(f"Method '{function_name}' is not simulated.")
 
diff --git a/packages/uipath/src/uipath/eval/mocks/_structured_output.py b/packages/uipath/src/uipath/eval/mocks/_structured_output.py
new file mode 100644
index 000000000..424935190
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/mocks/_structured_output.py
@@ -0,0 +1,70 @@
+"""Provider-agnostic structured output via LLM function calling.
+
+The normalized LLM Gateway honors OpenAI-style ``response_format`` (json_schema)
+only for OpenAI models. Non-OpenAI providers (Anthropic/Claude via Bedrock,
+Gemini) answer such requests with ``choices[0].message.content`` empty/None,
+which breaks JSON parsing. Function calling is honored across all providers, so
+the mockers request structured output as a forced tool call and read the result
+from the tool call's parsed arguments.
+"""
+
+from typing import Any
+
+RESPONSE_TOOL_NAME = "submit_tool_response"
+RESPONSE_KEY = "response"
+
+
+def build_response_tool(schema: dict[str, Any], description: str) -> dict[str, Any]:
+    """Build a normalized-API function tool that wraps ``schema`` under ``response``.
+
+    Tool-call arguments are always a JSON object, so an arbitrary output schema
+    (which may be a scalar, array, or object) is nested under a single
+    ``response`` property and unwrapped after the call.
+
+    Schemas from nested Pydantic models carry root ``$defs`` referenced by
+    ``$ref`` values like ``#/$defs/Item``. Those ``$ref`` paths resolve from the
+    parameters root, so ``$defs`` is hoisted there instead of being buried under
+    ``response`` (which would leave the references dangling).
+    """
+    response_schema = dict(schema)
+    parameters: dict[str, Any] = {
+        "type": "object",
+        "properties": {RESPONSE_KEY: response_schema},
+        "required": [RESPONSE_KEY],
+    }
+    defs = response_schema.pop("$defs", None)
+    if defs is not None:
+        parameters["$defs"] = defs
+
+    return {
+        "name": RESPONSE_TOOL_NAME,
+        "description": description,
+        "parameters": parameters,
+    }
+
+
+def extract_response(response: Any) -> Any:
+    """Extract the wrapped value from the forced tool call.
+
+    Raises:
+        ValueError: if the response carries no usable tool call or is missing the
+            wrapped ``response`` key.
+    """
+    choices = getattr(response, "choices", None)
+    if not choices:
+        raise ValueError("LLM response contained no choices")
+
+    message = choices[0].message
+    tool_calls = getattr(message, "tool_calls", None)
+    if not tool_calls:
+        raise ValueError(
+            f"LLM response contained no tool calls (content={message.content!r})"
+        )
+
+    arguments = tool_calls[0].arguments
+    if RESPONSE_KEY not in arguments:
+        raise ValueError(
+            f"Tool call arguments missing '{RESPONSE_KEY}' key: {arguments}"
+        )
+
+    return arguments[RESPONSE_KEY]
diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
index 72b3765df..181a14b6a 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
@@ -83,10 +83,18 @@ async def test_generate_llm_input_with_model_settings(
                     "index": 0,
                     "message": {
                         "role": "assistant",
-                        "content": '{"query": "Calculate 5 times 7"}',
-                        "tool_calls": None,
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {
+                                    "response": {"query": "Calculate 5 times 7"}
+                                },
+                            }
+                        ],
                     },
-                    "finish_reason": "stop",
+                    "finish_reason": "tool_calls",
                 }
             ],
             "usage": {
@@ -112,3 +120,15 @@ async def test_generate_llm_input_with_model_settings(
     assert len(chat_completion_requests) == 1, (
         "Expected exactly one chat completion request"
     )
+
+    # Structured output is requested via function calling (provider-agnostic),
+    # not via response_format which the gateway only honors for OpenAI models.
+    import json
+
+    body = json.loads(chat_completion_requests[0].content.decode("utf-8"))
+    assert "response_format" not in body
+    assert body["tool_choice"] == {"type": "required"}
+    tools = body["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == input_schema
diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
index 19a432fef..aeffa2d1e 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
@@ -57,10 +57,21 @@ async def test_simulate_input_span_attributes(httpx_mock: HTTPXMock, monkeypatch
                         "index": 0,
                         "message": {
                             "role": "assistant",
-                            "content": '{"name": "Alice", "greeting_style": "formal"}',
-                            "tool_calls": None,
+                            "content": None,
+                            "tool_calls": [
+                                {
+                                    "id": "call_1",
+                                    "name": "submit_tool_response",
+                                    "arguments": {
+                                        "response": {
+                                            "name": "Alice",
+                                            "greeting_style": "formal",
+                                        }
+                                    },
+                                }
+                            ],
                         },
-                        "finish_reason": "stop",
+                        "finish_reason": "tool_calls",
                     }
                 ],
                 "usage": {
@@ -199,10 +210,17 @@ async def test_simulate_input_span_on_error(httpx_mock: HTTPXMock, monkeypatch):
                         "index": 0,
                         "message": {
                             "role": "assistant",
-                            "content": "invalid json{{{",  # Invalid JSON
-                            "tool_calls": None,
+                            # Malformed: tool call is missing the wrapped "response" key
+                            "content": None,
+                            "tool_calls": [
+                                {
+                                    "id": "call_1",
+                                    "name": "submit_tool_response",
+                                    "arguments": {},
+                                }
+                            ],
                         },
-                        "finish_reason": "stop",
+                        "finish_reason": "tool_calls",
                     }
                 ],
                 "usage": {
diff --git a/packages/uipath/tests/cli/eval/mocks/test_mocks.py b/packages/uipath/tests/cli/eval/mocks/test_mocks.py
index c4bc26ee3..521871d5f 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_mocks.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_mocks.py
@@ -569,11 +569,17 @@ def foofoo(*args, **kwargs):
                 {
                     "index": 0,
                     "message": {
-                        "role": "ai",
-                        "content": '"bar1"',
-                        "tool_calls": None,
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": "bar1"},
+                            }
+                        ],
                     },
-                    "finish_reason": "EOS",
+                    "finish_reason": "tool_calls",
                 }
             ],
             "usage": {
@@ -599,14 +605,13 @@ def foofoo(*args, **kwargs):
     mock_request = httpx_mock.get_request(method="POST")
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert request["response_format"] == {
-        "type": "json_schema",
-        "json_schema": {
-            "name": "OutputSchema",
-            "strict": False,
-            "schema": {"type": "string"},
-        },
-    }
+    assert "response_format" not in request
+    assert request["tool_choice"] == {"type": "required"}
+    tools = request["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"}
+    assert tools[0]["parameters"]["required"] == ["response"]
 
     with pytest.raises(NotImplementedError):
         assert foofoo()
@@ -678,11 +683,17 @@ async def foofoo(*args, **kwargs):
                 {
                     "index": 0,
                     "message": {
-                        "role": "ai",
-                        "content": '"bar1"',
-                        "tool_calls": None,
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": "bar1"},
+                            }
+                        ],
                     },
-                    "finish_reason": "EOS",
+                    "finish_reason": "tool_calls",
                 }
             ],
             "usage": {
@@ -708,14 +719,13 @@ async def foofoo(*args, **kwargs):
     mock_request = httpx_mock.get_request()
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert request["response_format"] == {
-        "type": "json_schema",
-        "json_schema": {
-            "name": "OutputSchema",
-            "strict": False,
-            "schema": {"type": "string"},
-        },
-    }
+    assert "response_format" not in request
+    assert request["tool_choice"] == {"type": "required"}
+    tools = request["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"}
+    assert tools[0]["parameters"]["required"] == ["response"]
 
     with pytest.raises(NotImplementedError):
         assert await foofoo()
@@ -786,11 +796,17 @@ def foo(*args, **kwargs) -> dict[str, Any]:
                 {
                     "index": 0,
                     "message": {
-                        "role": "ai",
-                        "content": '{"content": "bar1"}',
-                        "tool_calls": None,
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": {"content": "bar1"}},
+                            }
+                        ],
                     },
-                    "finish_reason": "EOS",
+                    "finish_reason": "tool_calls",
                 }
             ],
             "usage": {
@@ -815,19 +831,18 @@ def foo(*args, **kwargs) -> dict[str, Any]:
     mock_request = httpx_mock.get_request()
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert request["response_format"] == {
-        "type": "json_schema",
-        "json_schema": {
-            "name": "OutputSchema",
-            "strict": False,
-            "schema": {
-                "required": ["content"],
-                "type": "object",
-                "additionalProperties": False,
-                "properties": {"content": {"type": "string"}},
-            },
-        },
+    assert "response_format" not in request
+    assert request["tool_choice"] == {"type": "required"}
+    tools = request["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == {
+        "required": ["content"],
+        "type": "object",
+        "additionalProperties": False,
+        "properties": {"content": {"type": "string"}},
     }
+    assert tools[0]["parameters"]["required"] == ["response"]
 
 
 @pytest.mark.asyncio
@@ -887,11 +902,17 @@ async def foo(*args, **kwargs) -> dict[str, Any]:
                 {
                     "index": 0,
                     "message": {
-                        "role": "ai",
-                        "content": '{"content": "bar1"}',
-                        "tool_calls": None,
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": {"content": "bar1"}},
+                            }
+                        ],
                     },
-                    "finish_reason": "EOS",
+                    "finish_reason": "tool_calls",
                 }
             ],
             "usage": {
@@ -916,19 +937,136 @@ async def foo(*args, **kwargs) -> dict[str, Any]:
     mock_request = httpx_mock.get_request()
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert request["response_format"] == {
-        "type": "json_schema",
-        "json_schema": {
-            "name": "OutputSchema",
-            "strict": False,
-            "schema": {
-                "required": ["content"],
-                "type": "object",
-                "additionalProperties": False,
-                "properties": {"content": {"type": "string"}},
-            },
+    assert "response_format" not in request
+    assert request["tool_choice"] == {"type": "required"}
+    tools = request["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == {
+        "required": ["content"],
+        "type": "object",
+        "additionalProperties": False,
+        "properties": {"content": {"type": "string"}},
+    }
+    assert tools[0]["parameters"]["required"] == ["response"]
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "gpt-4.1-mini-2025-04-14",
+        "anthropic.claude-sonnet-4-5-20250929-v1:0",
+        "gemini-2.5-pro",
+    ],
+)
+@pytest.mark.asyncio
+@pytest.mark.httpx_mock(assert_all_responses_were_requested=False)
+async def test_llm_mockable_structured_output_via_tool_call(
+    model: str, httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch
+):
+    """Tool simulation must work for all model providers (AE-1646).
+
+    The mocker requests structured output via function calling and reads the
+    result from the forced tool call's arguments, so it does not depend on the
+    OpenAI-only ``choices[0].message.content`` shape. Non-OpenAI providers
+    (Claude/Bedrock, Gemini) return structured output through ``tool_calls`` with
+    ``content`` set to ``None``; that must not raise.
+    """
+    monkeypatch.setenv("UIPATH_URL", "https://example.com")
+    monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "1234567890")
+    monkeypatch.setattr(CacheManager, "get", lambda *args, **kwargs: None)
+    monkeypatch.setattr(CacheManager, "set", lambda *args, **kwargs: None)
+
+    @mockable()
+    async def foo(*args, **kwargs) -> str:
+        raise NotImplementedError()
+
+    evaluation_item: dict[str, Any] = {
+        "id": "evaluation-id",
+        "name": "Mock foo",
+        "inputs": {},
+        "evaluationCriterias": {
+            "ExactMatchEvaluator": None,
+        },
+        "mockingStrategy": {
+            "type": "llm",
+            "prompt": "response is 'bar1'",
+            "toolsToSimulate": [{"name": "foo"}],
+            "model": {"model": model},
         },
     }
+    evaluation = EvaluationItem(**evaluation_item)
+    assert isinstance(evaluation.mocking_strategy, LLMMockingStrategy)
+    httpx_mock.add_response(
+        url="https://example.com/agenthub_/llm/api/capabilities",
+        status_code=200,
+        json={},
+    )
+    httpx_mock.add_response(
+        url="https://example.com/orchestrator_/llm/api/capabilities",
+        status_code=200,
+        json={},
+    )
+
+    httpx_mock.add_response(
+        url="https://example.com/llm/api/chat/completions"
+        "?api-version=2024-08-01-preview",
+        status_code=200,
+        json={
+            "id": "response-id",
+            "object": "",
+            "created": 0,
+            "model": model,
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": "bar1"},
+                            }
+                        ],
+                    },
+                    "finish_reason": "tool_calls",
+                }
+            ],
+            "usage": {
+                "prompt_tokens": 1,
+                "completion_tokens": 1,
+                "total_tokens": 2,
+            },
+        },
+    )
+
+    set_execution_context(
+        MockingContext(
+            strategy=evaluation.mocking_strategy,
+            name=evaluation.name,
+            inputs=evaluation.inputs,
+        ),
+        _mock_span_collector,
+        "test-execution-id",
+    )
+
+    assert await foo() == "bar1"
+
+    mock_request = httpx_mock.get_request(method="POST")
+    assert mock_request
+    request = json.loads(mock_request.content.decode("utf-8"))
+    # Structured output is requested via function calling, not response_format,
+    # so it works across all providers.
+    assert "response_format" not in request
+    assert request["tool_choice"] == {"type": "required"}
+    assert mock_request.headers["X-UiPath-LlmGateway-NormalizedApi-ModelName"] == model
+    tools = request["tools"]
+    assert len(tools) == 1
+    assert tools[0]["name"] == "submit_tool_response"
+    assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"}
+    assert tools[0]["parameters"]["required"] == ["response"]
 
 
 class TestUiPathMockRuntime:
diff --git a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
new file mode 100644
index 000000000..5cb0fc1fb
--- /dev/null
+++ b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
@@ -0,0 +1,73 @@
+"""Unit tests for the provider-agnostic structured-output helpers."""
+
+from types import SimpleNamespace
+
+import pytest
+
+from uipath.eval.mocks._structured_output import (
+    RESPONSE_KEY,
+    RESPONSE_TOOL_NAME,
+    build_response_tool,
+    extract_response,
+)
+
+
+def _response(message: SimpleNamespace | None) -> SimpleNamespace:
+    choices = [] if message is None else [SimpleNamespace(message=message)]
+    return SimpleNamespace(choices=choices)
+
+
+def test_build_response_tool_wraps_schema_under_response():
+    tool = build_response_tool({"type": "string"}, description="desc")
+    assert tool["name"] == RESPONSE_TOOL_NAME
+    assert tool["description"] == "desc"
+    assert tool["parameters"]["properties"][RESPONSE_KEY] == {"type": "string"}
+    assert tool["parameters"]["required"] == [RESPONSE_KEY]
+
+
+def test_build_response_tool_hoists_defs_to_root():
+    # Nested Pydantic models emit root $defs + $ref. Wrapping the schema under
+    # "response" must hoist $defs to the tool-parameters root so "#/$defs/Item"
+    # still resolves; otherwise nested-model schemas are invalid.
+    item_def = {"type": "object", "properties": {"sku": {"type": "string"}}}
+    schema = {
+        "type": "object",
+        "properties": {"items": {"type": "array", "items": {"$ref": "#/$defs/Item"}}},
+        "$defs": {"Item": item_def},
+    }
+
+    tool = build_response_tool(schema, description="d")
+    params = tool["parameters"]
+
+    assert params["$defs"] == {"Item": item_def}
+    assert "$defs" not in params["properties"][RESPONSE_KEY]
+    # the caller's schema dict is not mutated
+    assert "$defs" in schema
+
+
+def test_extract_response_returns_wrapped_value():
+    message = SimpleNamespace(
+        content=None,
+        tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: {"a": 1}})],
+    )
+    assert extract_response(_response(message)) == {"a": 1}
+
+
+def test_extract_response_raises_when_no_choices():
+    with pytest.raises(ValueError, match="no choices"):
+        extract_response(_response(None))
+
+
+def test_extract_response_raises_when_no_tool_calls():
+    # Non-OpenAI text response without a tool call: surface a clear error.
+    message = SimpleNamespace(content="not a tool call", tool_calls=None)
+    with pytest.raises(ValueError, match="no tool calls"):
+        extract_response(_response(message))
+
+
+def test_extract_response_raises_when_response_key_missing():
+    message = SimpleNamespace(
+        content=None, tool_calls=[SimpleNamespace(arguments={"other": 1})]
+    )
+    with pytest.raises(ValueError, match=RESPONSE_KEY):
+        extract_response(_response(message))

From ab331990de97137395797704497c53b4f0fca125 Mon Sep 17 00:00:00 2001
From: Chibi Vikram <chibicha@live.com>
Date: Fri, 29 May 2026 00:20:34 -0700
Subject: [PATCH 2/5] chore: bump uipath to 2.10.74 and uipath-platform to
 0.1.60

The mocker fix in uipath depends on the dict-tool passthrough in
uipath-platform, so uipath's lower-bound pin is raised to 0.1.60.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/uipath-platform/pyproject.toml | 2 +-
 packages/uipath-platform/uv.lock        | 2 +-
 packages/uipath/pyproject.toml          | 4 ++--
 packages/uipath/uv.lock                 | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/packages/uipath-platform/pyproject.toml b/packages/uipath-platform/pyproject.toml
index 215882460..dfc759f4d 100644
--- a/packages/uipath-platform/pyproject.toml
+++ b/packages/uipath-platform/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath-platform"
-version = "0.1.59"
+version = "0.1.60"
 description = "HTTP client library for programmatic access to UiPath Platform"
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
diff --git a/packages/uipath-platform/uv.lock b/packages/uipath-platform/uv.lock
index dabbd63ad..084f3efb8 100644
--- a/packages/uipath-platform/uv.lock
+++ b/packages/uipath-platform/uv.lock
@@ -1095,7 +1095,7 @@ dev = [
 
 [[package]]
 name = "uipath-platform"
-version = "0.1.59"
+version = "0.1.60"
 source = { editable = "." }
 dependencies = [
     { name = "httpx" },
diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml
index a7a978f7c..21c22f8a8 100644
--- a/packages/uipath/pyproject.toml
+++ b/packages/uipath/pyproject.toml
@@ -1,13 +1,13 @@
 [project]
 name = "uipath"
-version = "2.10.73"
+version = "2.10.74"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
 dependencies = [
   "uipath-core>=0.5.8, <0.6.0",
   "uipath-runtime>=0.10.1, <0.11.0",
-  "uipath-platform>=0.1.59, <0.2.0",
+  "uipath-platform>=0.1.60, <0.2.0",
   "click>=8.3.1",
   "httpx>=0.28.1",
   "pyjwt>=2.10.1",
diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock
index 3cf75bd40..365ed4d35 100644
--- a/packages/uipath/uv.lock
+++ b/packages/uipath/uv.lock
@@ -2552,7 +2552,7 @@ wheels = [
 
 [[package]]
 name = "uipath"
-version = "2.10.73"
+version = "2.10.74"
 source = { editable = "." }
 dependencies = [
     { name = "applicationinsights" },
@@ -2691,7 +2691,7 @@ dev = [
 
 [[package]]
 name = "uipath-platform"
-version = "0.1.59"
+version = "0.1.60"
 source = { editable = "../uipath-platform" }
 dependencies = [
     { name = "httpx" },

From 9a42ed38b5e70f93cce37f3080a53f779bf04b80 Mon Sep 17 00:00:00 2001
From: Chibi Vikram <chibicha@live.com>
Date: Fri, 29 May 2026 00:29:12 -0700
Subject: [PATCH 3/5] fix(eval): inline $defs/$ref in tool schema for
 non-OpenAI tool simulation

The normalized gateway accepts $ref/$defs in response_format but not inside a
tool's parameters. Tool outputs typed as nested Pydantic models/enums (e.g.
calculator's get_random_operator -> Wrapper[Operator]) produced a tool schema
with $ref/$defs that the gateway rejected, so simulation failed. Inline the
definitions into a self-contained schema (cyclic refs keep their $defs).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../uipath/eval/mocks/_structured_output.py   | 60 +++++++++++++++----
 .../cli/eval/mocks/test_structured_output.py  | 49 ++++++++++++---
 2 files changed, 91 insertions(+), 18 deletions(-)

diff --git a/packages/uipath/src/uipath/eval/mocks/_structured_output.py b/packages/uipath/src/uipath/eval/mocks/_structured_output.py
index 424935190..604553e1a 100644
--- a/packages/uipath/src/uipath/eval/mocks/_structured_output.py
+++ b/packages/uipath/src/uipath/eval/mocks/_structured_output.py
@@ -12,6 +12,50 @@
 
 RESPONSE_TOOL_NAME = "submit_tool_response"
 RESPONSE_KEY = "response"
+_DEFS_PREFIX = "#/$defs/"
+
+
+def _inline_defs(
+    schema: dict[str, Any],
+) -> tuple[dict[str, Any], dict[str, Any]]:
+    """Inline ``$defs``/``$ref`` into a self-contained schema.
+
+    Nested Pydantic models and enums emit root ``$defs`` referenced by ``$ref``.
+    The normalized gateway accepts those in ``response_format`` but not inside a
+    tool's ``parameters``, so they are inlined here. Self-referential definitions
+    cannot be inlined without looping; any ``$ref`` reached while its target is
+    already on the current resolution path is left untouched and its definition
+    is returned, with its own non-cyclic refs inlined, so it stays resolvable.
+
+    Returns:
+        A tuple of (inlined schema, leftover ``$defs`` needed for cyclic refs).
+    """
+    defs = schema.get("$defs", {})
+    leftover: dict[str, Any] = {}
+
+    def resolve(node: Any, active: frozenset[str]) -> Any:
+        if isinstance(node, dict):
+            ref = node.get("$ref")
+            if isinstance(ref, str) and ref.startswith(_DEFS_PREFIX):
+                name = ref[len(_DEFS_PREFIX) :]
+                if name in defs and name not in active:
+                    return resolve(defs[name], active | {name})
+                # Cyclic or unknown ref: keep it; store a self-contained definition.
+                if name in defs and name not in leftover:
+                    leftover[name] = {}  # placeholder stops re-entrant resolution
+                    leftover[name] = resolve(defs[name], frozenset({name}))
+                return dict(node)
+            return {
+                key: resolve(value, active)
+                for key, value in node.items()
+                if key != "$defs"
+            }
+        if isinstance(node, list):
+            return [resolve(item, active) for item in node]
+        return node
+
+    root = {key: value for key, value in schema.items() if key != "$defs"}
+    return resolve(root, frozenset()), leftover
 
 
 def build_response_tool(schema: dict[str, Any], description: str) -> dict[str, Any]:
@@ -19,22 +63,18 @@ def build_response_tool(schema: dict[str, Any], description: str) -> dict[str, A
 
     Tool-call arguments are always a JSON object, so an arbitrary output schema
     (which may be a scalar, array, or object) is nested under a single
-    ``response`` property and unwrapped after the call.
-
-    Schemas from nested Pydantic models carry root ``$defs`` referenced by
-    ``$ref`` values like ``#/$defs/Item``. Those ``$ref`` paths resolve from the
-    parameters root, so ``$defs`` is hoisted there instead of being buried under
-    ``response`` (which would leave the references dangling).
+    ``response`` property and unwrapped after the call. ``$defs``/``$ref`` are
+    inlined so the tool parameters are self-contained, which the gateway requires
+    for tool schemas (unlike ``response_format``).
     """
-    response_schema = dict(schema)
+    response_schema, leftover_defs = _inline_defs(schema)
     parameters: dict[str, Any] = {
         "type": "object",
         "properties": {RESPONSE_KEY: response_schema},
         "required": [RESPONSE_KEY],
     }
-    defs = response_schema.pop("$defs", None)
-    if defs is not None:
-        parameters["$defs"] = defs
+    if leftover_defs:
+        parameters["$defs"] = leftover_defs
 
     return {
         "name": RESPONSE_TOOL_NAME,
diff --git a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
index 5cb0fc1fb..c59a01a5b 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
@@ -1,5 +1,6 @@
 """Unit tests for the provider-agnostic structured-output helpers."""
 
+import json
 from types import SimpleNamespace
 
 import pytest
@@ -25,22 +26,54 @@ def test_build_response_tool_wraps_schema_under_response():
     assert tool["parameters"]["required"] == [RESPONSE_KEY]
 
 
-def test_build_response_tool_hoists_defs_to_root():
-    # Nested Pydantic models emit root $defs + $ref. Wrapping the schema under
-    # "response" must hoist $defs to the tool-parameters root so "#/$defs/Item"
-    # still resolves; otherwise nested-model schemas are invalid.
+def test_build_response_tool_inlines_refs_into_self_contained_schema():
+    # Nested Pydantic models / enums emit $defs + $ref. The normalized gateway
+    # accepts $ref/$defs in response_format but NOT in a tool's parameters, so the
+    # schema must be inlined into a self-contained form (no $ref/$defs anywhere).
+    operator_def = {"enum": ["+", "-", "*", "/"], "type": "string"}
     item_def = {"type": "object", "properties": {"sku": {"type": "string"}}}
     schema = {
         "type": "object",
-        "properties": {"items": {"type": "array", "items": {"$ref": "#/$defs/Item"}}},
-        "$defs": {"Item": item_def},
+        "properties": {
+            "operator": {"$ref": "#/$defs/Operator"},
+            "items": {"type": "array", "items": {"$ref": "#/$defs/Item"}},
+        },
+        "required": ["operator"],
+        "$defs": {"Operator": operator_def, "Item": item_def},
     }
 
     tool = build_response_tool(schema, description="d")
     params = tool["parameters"]
 
-    assert params["$defs"] == {"Item": item_def}
-    assert "$defs" not in params["properties"][RESPONSE_KEY]
+    blob = json.dumps(params)
+    assert "$ref" not in blob
+    assert "$defs" not in blob
+
+    response = params["properties"][RESPONSE_KEY]
+    assert response["properties"]["operator"] == operator_def
+    assert response["properties"]["items"]["items"] == item_def
+    # caller's schema is not mutated
+    assert "$defs" in schema
+
+
+def test_build_response_tool_keeps_defs_for_cyclic_refs():
+    # Self-referential schemas can't be fully inlined; keep $defs hoisted so the
+    # remaining $ref still resolves rather than infinite-looping.
+    node_def = {
+        "type": "object",
+        "properties": {"child": {"$ref": "#/$defs/Node"}},
+    }
+    schema = {
+        "type": "object",
+        "properties": {"root": {"$ref": "#/$defs/Node"}},
+        "$defs": {"Node": node_def},
+    }
+
+    tool = build_response_tool(schema, description="d")
+    params = tool["parameters"]
+
+    assert "$defs" in params
+    assert "$ref" in json.dumps(params)
     # the caller's schema dict is not mutated
     assert "$defs" in schema
 

From ae78cbe6ae5f6ba83c8f02dbaceec1fcd8667ac7 Mon Sep 17 00:00:00 2001
From: Chibi Vikram <chibicha@live.com>
Date: Fri, 29 May 2026 01:06:03 -0700
Subject: [PATCH 4/5] fix(eval): prefer response_format with tool-call fallback
 for mockers

All-tool-calling regressed OpenAI tool simulation (calculator-evals 'Test Random
Addition Using LLM' became flaky: gpt_4_1_mini returned wrong/empty values for a
nested-enum output schema via function calling, where response_format was
reliable). Make structured-output generation adaptive: prefer response_format
(honored reliably by OpenAI, native $defs support) and fall back to a forced
tool call only when content comes back empty (the non-OpenAI failure mode, e.g.
Claude/Bedrock). Shared in generate_structured_output(), used by both mockers.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../src/uipath/eval/mocks/_input_mocker.py    |  25 +-
 .../src/uipath/eval/mocks/_llm_mocker.py      |  37 +--
 .../uipath/eval/mocks/_structured_output.py   |  70 ++++-
 .../tests/cli/eval/mocks/test_input_mocker.py |  25 +-
 .../cli/eval/mocks/test_input_mocker_span.py  |  30 +-
 .../uipath/tests/cli/eval/mocks/test_mocks.py | 268 ++++++++----------
 .../cli/eval/mocks/test_structured_output.py  |  88 ++++++
 7 files changed, 307 insertions(+), 236 deletions(-)

diff --git a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
index 3c4daac51..a542fc7ad 100644
--- a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
+++ b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
@@ -11,12 +11,11 @@
 from uipath.platform import UiPath
 from uipath.platform.chat import UiPathLlmChatService
 from uipath.platform.chat._llm_gateway_service import ChatModels
-from uipath.platform.chat.llm_gateway import RequiredToolChoice
 
 from .._execution_context import eval_set_run_id_context
 from ._mock_context import cache_manager_context
 from ._mocker import UiPathInputMockingError
-from ._structured_output import build_response_tool, extract_response
+from ._structured_output import generate_structured_output
 from ._types import (
     InputMockingStrategy,
 )
@@ -107,14 +106,6 @@ async def generate_llm_input(
 
         prompt = get_input_mocking_prompt(**prompt_generation_args)
 
-        # Request structured output via function calling so it works across all
-        # model providers (OpenAI, Claude/Bedrock, Gemini); response_format is only
-        # honored for OpenAI models on the normalized gateway.
-        response_tool = build_response_tool(
-            input_schema,
-            description="Return the simulated agent input matching the required schema.",
-        )
-
         model_parameters = mocking_strategy.model if mocking_strategy else None
         completion_kwargs = (
             model_parameters.model_dump(by_alias=False, exclude_none=True)
@@ -129,7 +120,7 @@ async def generate_llm_input(
 
         if cache_manager is not None:
             cache_key_data = {
-                "response_tool": response_tool,
+                "input_schema": input_schema,
                 "completion_kwargs": completion_kwargs,
                 "prompt_generation_args": prompt_generation_args,
             }
@@ -143,15 +134,15 @@ async def generate_llm_input(
             if cached_response is not None:
                 return cached_response
 
-        response = await llm.chat_completions(
+        result = await generate_structured_output(
+            llm,
             [{"role": "user", "content": prompt}],
-            tools=[response_tool],
-            tool_choice=RequiredToolChoice(),
-            **completion_kwargs,
+            schema=input_schema,
+            response_format_name="agent_input",
+            description="Return the simulated agent input matching the required schema.",
+            completion_kwargs=completion_kwargs,
         )
 
-        result = extract_response(response)
-
         if cache_manager is not None:
             cache_manager.set(
                 mocker_type="input_mocker",
diff --git a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
index ce932da11..a9ab7005e 100644
--- a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
+++ b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
@@ -11,7 +11,6 @@
 from uipath.platform import UiPath
 from uipath.platform.chat import UiPathLlmChatService
 from uipath.platform.chat._llm_gateway_service import ChatModels, _cleanup_schema
-from uipath.platform.chat.llm_gateway import RequiredToolChoice
 
 from .._execution_context import (
     eval_set_run_id_context,
@@ -29,7 +28,7 @@
     UiPathMockResponseGenerationError,
     UiPathNoMockFoundError,
 )
-from ._structured_output import build_response_tool, extract_response
+from ._structured_output import generate_structured_output
 from ._types import (
     ExampleCall,
     LLMMockingStrategy,
@@ -127,16 +126,7 @@ async def response(
                 "output_schema", TypeAdapter(return_type).json_schema()
             )
 
-            # Request structured output via function calling so it works across
-            # all model providers (OpenAI, Claude/Bedrock, Gemini); response_format
-            # is only honored for OpenAI models on the normalized gateway.
-            response_tool = build_response_tool(
-                _cleanup_schema(output_schema),
-                description=(
-                    "Return the simulated response for tool "
-                    f"'{function_name}' matching the required schema."
-                ),
-            )
+            cleaned_schema = _cleanup_schema(output_schema)
             try:
                 # Safely pull examples from params.
                 example_calls = params.get("example_calls", [])
@@ -201,7 +191,7 @@ async def response(
                 formatted_prompt = PROMPT.format(**prompt_generation_args)
 
                 cache_key_data = {
-                    "response_tool": response_tool,
+                    "output_schema": cleaned_schema,
                     "completion_kwargs": completion_kwargs,
                     "prompt_generation_args": prompt_generation_args,
                 }
@@ -217,18 +207,17 @@ async def response(
                     if cached_response is not None:
                         return cached_response
 
-                response = await llm.chat_completions(
-                    [
-                        {
-                            "role": "user",
-                            "content": formatted_prompt,
-                        },
-                    ],
-                    tools=[response_tool],
-                    tool_choice=RequiredToolChoice(),
-                    **completion_kwargs,
+                result = await generate_structured_output(
+                    llm,
+                    [{"role": "user", "content": formatted_prompt}],
+                    schema=cleaned_schema,
+                    response_format_name="OutputSchema",
+                    description=(
+                        "Return the simulated response for tool "
+                        f"'{function_name}' matching the required schema."
+                    ),
+                    completion_kwargs=completion_kwargs,
                 )
-                result = extract_response(response)
 
                 if cache_manager is not None:
                     cache_manager.set(
diff --git a/packages/uipath/src/uipath/eval/mocks/_structured_output.py b/packages/uipath/src/uipath/eval/mocks/_structured_output.py
index 604553e1a..1f67565b9 100644
--- a/packages/uipath/src/uipath/eval/mocks/_structured_output.py
+++ b/packages/uipath/src/uipath/eval/mocks/_structured_output.py
@@ -1,19 +1,27 @@
-"""Provider-agnostic structured output via LLM function calling.
+"""Provider-agnostic structured output for the eval mockers.
 
 The normalized LLM Gateway honors OpenAI-style ``response_format`` (json_schema)
-only for OpenAI models. Non-OpenAI providers (Anthropic/Claude via Bedrock,
-Gemini) return such requests with ``choices[0].message.content`` empty/None,
-which breaks JSON parsing. Function calling is honored across all providers, so
-the mockers request structured output as a forced tool call and read the result
-from the tool call's parsed arguments.
+only for OpenAI models — and does so reliably, including native ``$defs``
+support. Non-OpenAI providers (Anthropic/Claude via Bedrock, Gemini) return such
+requests with ``choices[0].message.content`` empty/None, which breaks JSON
+parsing. Function calling is honored across providers but is less reliable for
+OpenAI on some schemas, so it is used only as a fallback: prefer
+``response_format`` and fall back to a forced tool call when the content comes
+back empty.
 """
 
+import json
+import logging
 from typing import Any
 
+from uipath.platform.chat.llm_gateway import RequiredToolChoice
+
 RESPONSE_TOOL_NAME = "submit_tool_response"
 RESPONSE_KEY = "response"
 _DEFS_PREFIX = "#/$defs/"
 
+logger = logging.getLogger(__name__)
+
 
 def _inline_defs(
     schema: dict[str, Any],
@@ -108,3 +116,53 @@ def extract_response(response: Any) -> Any:
         )
 
     return arguments[RESPONSE_KEY]
+
+
+async def generate_structured_output(
+    llm: Any,
+    messages: list[dict[str, str]],
+    *,
+    schema: dict[str, Any],
+    response_format_name: str,
+    description: str,
+    completion_kwargs: dict[str, Any],
+) -> Any:
+    """Generate structured output that works across all model providers.
+
+    Prefers ``response_format`` (json_schema), which OpenAI honors reliably with
+    native ``$defs``; its content is parsed with ``json.loads`` (invalid JSON
+    raises). If that call fails or returns empty content (the non-OpenAI failure
+    mode, e.g. Claude/Bedrock), falls back to a forced tool call and unwraps it.
+    """
+    response_format = {
+        "type": "json_schema",
+        "json_schema": {
+            "name": response_format_name,
+            "strict": False,
+            "schema": schema,
+        },
+    }
+
+    content: str | None = None
+    try:
+        rf_response = await llm.chat_completions(
+            messages, response_format=response_format, **completion_kwargs
+        )
+        choices = getattr(rf_response, "choices", None)
+        if choices:
+            content = choices[0].message.content
+    except Exception as e:
+        # Some providers reject response_format outright; fall back to tools.
+        logger.info("response_format path failed, falling back to tools: %s", e)
+
+    if content:
+        return json.loads(content)
+
+    tool = build_response_tool(schema, description)
+    tc_response = await llm.chat_completions(
+        messages,
+        tools=[tool],
+        tool_choice=RequiredToolChoice(),
+        **completion_kwargs,
+    )
+    return extract_response(tc_response)
diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
index 181a14b6a..a8a8a64ec 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
@@ -83,18 +83,10 @@ async def test_generate_llm_input_with_model_settings(
                     "index": 0,
                     "message": {
                         "role": "assistant",
-                        "content": None,
-                        "tool_calls": [
-                            {
-                                "id": "call_1",
-                                "name": "submit_tool_response",
-                                "arguments": {
-                                    "response": {"query": "Calculate 5 times 7"}
-                                },
-                            }
-                        ],
+                        "content": '{"query": "Calculate 5 times 7"}',
+                        "tool_calls": None,
                     },
-                    "finish_reason": "tool_calls",
+                    "finish_reason": "stop",
                 }
             ],
             "usage": {
@@ -121,14 +113,9 @@ async def test_generate_llm_input_with_model_settings(
         "Expected exactly one chat completion request"
     )
 
-    # Structured output is requested via function calling (provider-agnostic),
-    # not via response_format which the gateway only honors for OpenAI models.
+    # OpenAI returns content via response_format; no tool-call fallback needed.
     import json
 
     body = json.loads(chat_completion_requests[0].content.decode("utf-8"))
-    assert "response_format" not in body
-    assert body["tool_choice"] == {"type": "required"}
-    tools = body["tools"]
-    assert len(tools) == 1
-    assert tools[0]["name"] == "submit_tool_response"
-    assert tools[0]["parameters"]["properties"]["response"] == input_schema
+    assert "response_format" in body
+    assert "tools" not in body
diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
index aeffa2d1e..19a432fef 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
@@ -57,21 +57,10 @@ async def test_simulate_input_span_attributes(httpx_mock: HTTPXMock, monkeypatch
                         "index": 0,
                         "message": {
                             "role": "assistant",
-                            "content": None,
-                            "tool_calls": [
-                                {
-                                    "id": "call_1",
-                                    "name": "submit_tool_response",
-                                    "arguments": {
-                                        "response": {
-                                            "name": "Alice",
-                                            "greeting_style": "formal",
-                                        }
-                                    },
-                                }
-                            ],
+                            "content": '{"name": "Alice", "greeting_style": "formal"}',
+                            "tool_calls": None,
                         },
-                        "finish_reason": "tool_calls",
+                        "finish_reason": "stop",
                     }
                 ],
                 "usage": {
@@ -210,17 +199,10 @@ async def test_simulate_input_span_on_error(httpx_mock: HTTPXMock, monkeypatch):
                         "index": 0,
                         "message": {
                             "role": "assistant",
-                            # Malformed: tool call is missing the wrapped "response" key
-                            "content": None,
-                            "tool_calls": [
-                                {
-                                    "id": "call_1",
-                                    "name": "submit_tool_response",
-                                    "arguments": {},
-                                }
-                            ],
+                            "content": "invalid json{{{",  # Invalid JSON
+                            "tool_calls": None,
                         },
-                        "finish_reason": "tool_calls",
+                        "finish_reason": "stop",
                     }
                 ],
                 "usage": {
diff --git a/packages/uipath/tests/cli/eval/mocks/test_mocks.py b/packages/uipath/tests/cli/eval/mocks/test_mocks.py
index 521871d5f..ab85deb96 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_mocks.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_mocks.py
@@ -569,17 +569,11 @@ def foofoo(*args, **kwargs):
                 {
                     "index": 0,
                     "message": {
-                        "role": "assistant",
-                        "content": None,
-                        "tool_calls": [
-                            {
-                                "id": "call_1",
-                                "name": "submit_tool_response",
-                                "arguments": {"response": "bar1"},
-                            }
-                        ],
+                        "role": "ai",
+                        "content": '"bar1"',
+                        "tool_calls": None,
                     },
-                    "finish_reason": "tool_calls",
+                    "finish_reason": "EOS",
                 }
             ],
             "usage": {
@@ -605,22 +599,25 @@ def foofoo(*args, **kwargs):
     mock_request = httpx_mock.get_request(method="POST")
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert "response_format" not in request
-    assert request["tool_choice"] == {"type": "required"}
-    tools = request["tools"]
-    assert len(tools) == 1
-    assert tools[0]["name"] == "submit_tool_response"
-    assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"}
-    assert tools[0]["parameters"]["required"] == ["response"]
+    assert request["response_format"] == {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "OutputSchema",
+            "strict": False,
+            "schema": {"type": "string"},
+        },
+    }
 
     with pytest.raises(NotImplementedError):
         assert foofoo()
-    httpx_mock.add_response(
-        url="https://example.com/llm/api/chat/completions"
-        "?api-version=2024-08-01-preview",
-        status_code=200,
-        json={},
-    )
+    # Two empty responses: the response_format attempt and the tool-call fallback.
+    for _ in range(2):
+        httpx_mock.add_response(
+            url="https://example.com/llm/api/chat/completions"
+            "?api-version=2024-08-01-preview",
+            status_code=200,
+            json={},
+        )
     with pytest.raises(UiPathMockResponseGenerationError):
         assert foo()
 
@@ -683,17 +680,11 @@ async def foofoo(*args, **kwargs):
                 {
                     "index": 0,
                     "message": {
-                        "role": "assistant",
-                        "content": None,
-                        "tool_calls": [
-                            {
-                                "id": "call_1",
-                                "name": "submit_tool_response",
-                                "arguments": {"response": "bar1"},
-                            }
-                        ],
+                        "role": "ai",
+                        "content": '"bar1"',
+                        "tool_calls": None,
                     },
-                    "finish_reason": "tool_calls",
+                    "finish_reason": "EOS",
                 }
             ],
             "usage": {
@@ -719,23 +710,26 @@ async def foofoo(*args, **kwargs):
     mock_request = httpx_mock.get_request()
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert "response_format" not in request
-    assert request["tool_choice"] == {"type": "required"}
-    tools = request["tools"]
-    assert len(tools) == 1
-    assert tools[0]["name"] == "submit_tool_response"
-    assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"}
-    assert tools[0]["parameters"]["required"] == ["response"]
+    assert request["response_format"] == {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "OutputSchema",
+            "strict": False,
+            "schema": {"type": "string"},
+        },
+    }
 
     with pytest.raises(NotImplementedError):
         assert await foofoo()
 
-    httpx_mock.add_response(
-        url="https://example.com/llm/api/chat/completions"
-        "?api-version=2024-08-01-preview",
-        status_code=200,
-        json={},
-    )
+    # Two empty responses: the response_format attempt and the tool-call fallback.
+    for _ in range(2):
+        httpx_mock.add_response(
+            url="https://example.com/llm/api/chat/completions"
+            "?api-version=2024-08-01-preview",
+            status_code=200,
+            json={},
+        )
     with pytest.raises(UiPathMockResponseGenerationError):
         assert await foo()
 
@@ -796,17 +790,11 @@ def foo(*args, **kwargs) -> dict[str, Any]:
                 {
                     "index": 0,
                     "message": {
-                        "role": "assistant",
-                        "content": None,
-                        "tool_calls": [
-                            {
-                                "id": "call_1",
-                                "name": "submit_tool_response",
-                                "arguments": {"response": {"content": "bar1"}},
-                            }
-                        ],
+                        "role": "ai",
+                        "content": '{"content": "bar1"}',
+                        "tool_calls": None,
                     },
-                    "finish_reason": "tool_calls",
+                    "finish_reason": "EOS",
                 }
             ],
             "usage": {
@@ -831,18 +819,19 @@ def foo(*args, **kwargs) -> dict[str, Any]:
     mock_request = httpx_mock.get_request()
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert "response_format" not in request
-    assert request["tool_choice"] == {"type": "required"}
-    tools = request["tools"]
-    assert len(tools) == 1
-    assert tools[0]["name"] == "submit_tool_response"
-    assert tools[0]["parameters"]["properties"]["response"] == {
-        "required": ["content"],
-        "type": "object",
-        "additionalProperties": False,
-        "properties": {"content": {"type": "string"}},
+    assert request["response_format"] == {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "OutputSchema",
+            "strict": False,
+            "schema": {
+                "required": ["content"],
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {"content": {"type": "string"}},
+            },
+        },
     }
-    assert tools[0]["parameters"]["required"] == ["response"]
 
 
 @pytest.mark.asyncio
@@ -902,17 +891,11 @@ async def foo(*args, **kwargs) -> dict[str, Any]:
                 {
                     "index": 0,
                     "message": {
-                        "role": "assistant",
-                        "content": None,
-                        "tool_calls": [
-                            {
-                                "id": "call_1",
-                                "name": "submit_tool_response",
-                                "arguments": {"response": {"content": "bar1"}},
-                            }
-                        ],
+                        "role": "ai",
+                        "content": '{"content": "bar1"}',
+                        "tool_calls": None,
                     },
-                    "finish_reason": "tool_calls",
+                    "finish_reason": "EOS",
                 }
             ],
             "usage": {
@@ -937,40 +920,31 @@ async def foo(*args, **kwargs) -> dict[str, Any]:
     mock_request = httpx_mock.get_request()
     assert mock_request
     request = json.loads(mock_request.content.decode("utf-8"))
-    assert "response_format" not in request
-    assert request["tool_choice"] == {"type": "required"}
-    tools = request["tools"]
-    assert len(tools) == 1
-    assert tools[0]["name"] == "submit_tool_response"
-    assert tools[0]["parameters"]["properties"]["response"] == {
-        "required": ["content"],
-        "type": "object",
-        "additionalProperties": False,
-        "properties": {"content": {"type": "string"}},
+    assert request["response_format"] == {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "OutputSchema",
+            "strict": False,
+            "schema": {
+                "required": ["content"],
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {"content": {"type": "string"}},
+            },
+        },
     }
-    assert tools[0]["parameters"]["required"] == ["response"]
 
 
-@pytest.mark.parametrize(
-    "model",
-    [
-        "gpt-4.1-mini-2025-04-14",
-        "anthropic.claude-sonnet-4-5-20250929-v1:0",
-        "gemini-2.5-pro",
-    ],
-)
 @pytest.mark.asyncio
 @pytest.mark.httpx_mock(assert_all_responses_were_requested=False)
-async def test_llm_mockable_structured_output_via_tool_call(
-    model: str, httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch
+async def test_llm_mockable_falls_back_to_tool_call_for_non_openai(
+    httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch
 ):
-    """Tool simulation must work for all model providers (AE-1646).
+    """Tool simulation works for non-OpenAI providers (AE-1646).
 
-    The mocker requests structured output via function calling and reads the
-    result from the forced tool call's arguments, so it does not depend on the
-    OpenAI-only ``choices[0].message.content`` shape. Non-OpenAI providers
-    (Claude/Bedrock, Gemini) return structured output through ``tool_calls`` with
-    ``content`` set to ``None``; that must not raise.
+    Non-OpenAI providers (Claude/Bedrock, Gemini) answer ``response_format``
+    requests with empty ``content``. The mocker must then fall back to function
+    calling and read the result from the forced tool call's arguments.
     """
     monkeypatch.setenv("UIPATH_URL", "https://example.com")
     monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "1234567890")
@@ -992,7 +966,7 @@ async def foo(*args, **kwargs) -> str:
             "type": "llm",
             "prompt": "response is 'bar1'",
             "toolsToSimulate": [{"name": "foo"}],
-            "model": {"model": model},
+            "model": {"model": "anthropic.claude-sonnet-4-5-20250929-v1:0"},
         },
     }
     evaluation = EvaluationItem(**evaluation_item)
@@ -1008,38 +982,41 @@ async def foo(*args, **kwargs) -> str:
         json={},
     )
 
+    def _completion(message: dict[str, Any]) -> dict[str, Any]:
+        return {
+            "id": "response-id",
+            "object": "",
+            "created": 0,
+            "model": "anthropic.claude-sonnet-4-5-20250929-v1:0",
+            "choices": [{"index": 0, "message": message, "finish_reason": "stop"}],
+            "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2},
+        }
+
+    # First call (response_format) returns empty content — the non-OpenAI failure.
     httpx_mock.add_response(
         url="https://example.com/llm/api/chat/completions"
         "?api-version=2024-08-01-preview",
         status_code=200,
-        json={
-            "id": "response-id",
-            "object": "",
-            "created": 0,
-            "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "message": {
-                        "role": "assistant",
-                        "content": None,
-                        "tool_calls": [
-                            {
-                                "id": "call_1",
-                                "name": "submit_tool_response",
-                                "arguments": {"response": "bar1"},
-                            }
-                        ],
-                    },
-                    "finish_reason": "tool_calls",
-                }
-            ],
-            "usage": {
-                "prompt_tokens": 1,
-                "completion_tokens": 1,
-                "total_tokens": 2,
-            },
-        },
+        json=_completion({"role": "assistant", "content": None, "tool_calls": None}),
+    )
+    # Fallback call (function calling) returns the structured result.
+    httpx_mock.add_response(
+        url="https://example.com/llm/api/chat/completions"
+        "?api-version=2024-08-01-preview",
+        status_code=200,
+        json=_completion(
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [
+                    {
+                        "id": "call_1",
+                        "name": "submit_tool_response",
+                        "arguments": {"response": "bar1"},
+                    }
+                ],
+            }
+        ),
     )
 
     set_execution_context(
@@ -1054,19 +1031,18 @@ async def foo(*args, **kwargs) -> str:
 
     assert await foo() == "bar1"
 
-    mock_request = httpx_mock.get_request(method="POST")
-    assert mock_request
-    request = json.loads(mock_request.content.decode("utf-8"))
-    # Structured output is requested via function calling, not response_format,
-    # so it works across all providers.
-    assert "response_format" not in request
-    assert request["tool_choice"] == {"type": "required"}
-    assert mock_request.headers["X-UiPath-LlmGateway-NormalizedApi-ModelName"] == model
-    tools = request["tools"]
-    assert len(tools) == 1
-    assert tools[0]["name"] == "submit_tool_response"
-    assert tools[0]["parameters"]["properties"]["response"] == {"type": "string"}
-    assert tools[0]["parameters"]["required"] == ["response"]
+    requests = [
+        r for r in httpx_mock.get_requests() if "chat/completions" in str(r.url)
+    ]
+    assert len(requests) == 2
+    first = json.loads(requests[0].content.decode("utf-8"))
+    second = json.loads(requests[1].content.decode("utf-8"))
+    # First attempt uses response_format; fallback uses a forced tool call.
+    assert "response_format" in first
+    assert "tools" not in first
+    assert second["tool_choice"] == {"type": "required"}
+    assert second["tools"][0]["name"] == "submit_tool_response"
+    assert "response_format" not in second
 
 
 class TestUiPathMockRuntime:
diff --git a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
index c59a01a5b..730db0449 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
@@ -10,6 +10,7 @@
     RESPONSE_TOOL_NAME,
     build_response_tool,
     extract_response,
+    generate_structured_output,
 )
 
 
@@ -18,6 +19,21 @@ def _response(message: SimpleNamespace | None) -> SimpleNamespace:
     return SimpleNamespace(choices=choices)
 
 
+class _FakeLLM:
+    """Records chat_completions calls and replays queued responses in order."""
+
+    def __init__(self, responses):
+        self._responses = list(responses)
+        self.calls: list[dict] = []
+
+    async def chat_completions(self, messages, **kwargs):
+        self.calls.append(kwargs)
+        nxt = self._responses.pop(0)
+        if isinstance(nxt, Exception):
+            raise nxt
+        return nxt
+
+
 def test_build_response_tool_wraps_schema_under_response():
     tool = build_response_tool({"type": "string"}, description="desc")
     assert tool["name"] == RESPONSE_TOOL_NAME
@@ -104,3 +120,75 @@ def test_extract_response_raises_when_response_key_missing():
     )
     with pytest.raises(ValueError, match=RESPONSE_KEY):
         extract_response(_response(message))
+
+
+@pytest.mark.asyncio
+async def test_generate_structured_output_prefers_response_format_content():
+    # OpenAI returns content via response_format; no fallback call is made.
+    llm = _FakeLLM([_response(SimpleNamespace(content='{"a": 1}', tool_calls=None))])
+    result = await generate_structured_output(
+        llm,
+        [{"role": "user", "content": "x"}],
+        schema={"type": "object"},
+        response_format_name="OutputSchema",
+        description="d",
+        completion_kwargs={},
+    )
+    assert result == {"a": 1}
+    assert len(llm.calls) == 1
+    assert "response_format" in llm.calls[0]
+    assert "tools" not in llm.calls[0]
+
+
+@pytest.mark.asyncio
+async def test_generate_structured_output_falls_back_on_empty_content():
+    # Non-OpenAI: response_format yields empty content -> fall back to tool call.
+    llm = _FakeLLM(
+        [
+            _response(SimpleNamespace(content=None, tool_calls=None)),
+            _response(
+                SimpleNamespace(
+                    content=None,
+                    tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: {"a": 1}})],
+                )
+            ),
+        ]
+    )
+    result = await generate_structured_output(
+        llm,
+        [{"role": "user", "content": "x"}],
+        schema={"type": "object"},
+        response_format_name="OutputSchema",
+        description="d",
+        completion_kwargs={},
+    )
+    assert result == {"a": 1}
+    assert len(llm.calls) == 2
+    assert "response_format" in llm.calls[0]
+    assert "tools" in llm.calls[1] and "tool_choice" in llm.calls[1]
+
+
+@pytest.mark.asyncio
+async def test_generate_structured_output_falls_back_when_response_format_raises():
+    # A provider that rejects response_format outright still gets a tool fallback.
+    llm = _FakeLLM(
+        [
+            RuntimeError("response_format unsupported"),
+            _response(
+                SimpleNamespace(
+                    content=None,
+                    tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: "ok"})],
+                )
+            ),
+        ]
+    )
+    result = await generate_structured_output(
+        llm,
+        [{"role": "user", "content": "x"}],
+        schema={"type": "string"},
+        response_format_name="OutputSchema",
+        description="d",
+        completion_kwargs={},
+    )
+    assert result == "ok"
+    assert len(llm.calls) == 2

From b4954bee90becf479b6f0815da0ec8eb8a3ceaca Mon Sep 17 00:00:00 2001
From: Chibi Vikram <chibicha@live.com>
Date: Fri, 29 May 2026 01:19:31 -0700
Subject: [PATCH 5/5] test(eval): add explicit type params to _FakeLLM for mypy

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../uipath/tests/cli/eval/mocks/test_structured_output.py  | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
index 730db0449..a19e2605e 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
@@ -2,6 +2,7 @@
 
 import json
 from types import SimpleNamespace
+from typing import Any
 
 import pytest
 
@@ -22,11 +23,11 @@ def _response(message: SimpleNamespace | None) -> SimpleNamespace:
 class _FakeLLM:
     """Records chat_completions calls and replays queued responses in order."""
 
-    def __init__(self, responses):
+    def __init__(self, responses: list[Any]):
         self._responses = list(responses)
-        self.calls: list[dict] = []
+        self.calls: list[dict[str, Any]] = []
 
-    async def chat_completions(self, messages, **kwargs):
+    async def chat_completions(self, messages: Any, **kwargs: Any) -> Any:
         self.calls.append(kwargs)
         nxt = self._responses.pop(0)
         if isinstance(nxt, Exception):