From 390d0adb575279ce3e17b32e8d4ba02897fb448c Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Wed, 18 Mar 2026 15:21:23 +0100 Subject: [PATCH 01/17] initial work on code_mode --- .../packages/core/agent_framework/_agents.py | 7 +- .../core/agent_framework/_sessions.py | 6 +- .../packages/core/tests/core/test_agents.py | 63 +++ .../tools/code_mode_context_provider.py | 491 ++++++++++++++++++ .../samples/02-agents/tools/code_mode_tool.py | 347 +++++++++++++ 5 files changed, 910 insertions(+), 4 deletions(-) create mode 100644 python/samples/02-agents/tools/code_mode_context_provider.py create mode 100644 python/samples/02-agents/tools/code_mode_tool.py diff --git a/python/packages/core/agent_framework/_agents.py b/python/packages/core/agent_framework/_agents.py index 585898ae52..464d9bc21f 100644 --- a/python/packages/core/agent_framework/_agents.py +++ b/python/packages/core/agent_framework/_agents.py @@ -1191,11 +1191,14 @@ async def _prepare_run_context( options=opts, service_stores_history=bool(store_), ) + provider_options = dict(opts) + if tools_ is not None: + provider_options["tools"] = tools_ session_context, chat_options = await self._prepare_session_and_messages( session=active_session, input_messages=input_messages, - options=opts, + options=provider_options, ) default_additional_args = chat_options.pop("additional_function_arguments", None) if isinstance(default_additional_args, Mapping): @@ -1209,7 +1212,7 @@ async def _prepare_run_context( mcp_duplicate_message = "Tool names must be unique. Consider setting `tool_name_prefix` on the MCPTool." 
# Normalize tools - normalized_tools = normalize_tools(tools_) + normalized_tools = normalize_tools(session_context.options.get("tools", tools_)) # Resolve final tool list (configured tools + runtime provided tools + local MCP server tools) final_tools = list(base_tools) diff --git a/python/packages/core/agent_framework/_sessions.py b/python/packages/core/agent_framework/_sessions.py index 55d1a10a18..7754475e63 100644 --- a/python/packages/core/agent_framework/_sessions.py +++ b/python/packages/core/agent_framework/_sessions.py @@ -149,7 +149,8 @@ class SessionContext: middleware: Dict mapping source_id -> chat/function middleware added by that provider. Maintains insertion order (provider execution order). response: After invocation, contains the full AgentResponse, should not be changed. - options: Options passed to agent.run() - read-only, for reflection only. + options: Options passed to agent.run(). Providers can inspect these and may + update ``options["tools"]`` to influence per-run tool resolution. metadata: Shared metadata dictionary for cross-provider communication. """ @@ -176,7 +177,8 @@ def __init__( instructions: Pre-populated instructions. tools: Pre-populated tools. middleware: Pre-populated chat/function middleware by source. - options: Options from agent.run() - read-only for providers. + options: Options from agent.run(). Providers may inspect these and can + update ``options["tools"]`` to influence per-run tool resolution. metadata: Shared metadata for cross-provider communication. 
""" self.session_id = session_id diff --git a/python/packages/core/tests/core/test_agents.py b/python/packages/core/tests/core/test_agents.py index c7b3d7860c..ff57f960ce 100644 --- a/python/packages/core/tests/core/test_agents.py +++ b/python/packages/core/tests/core/test_agents.py @@ -841,6 +841,69 @@ async def test_per_service_call_persistence_rejects_existing_conversation_id_whe await agent.run("Hello", session=session, options={"store": False, "conversation_id": "existing_conversation"}) +async def test_context_provider_can_inspect_runtime_tools_from_run( + chat_client_base: SupportsChatGetResponse, +) -> None: + seen_tools: list[Any] = [] + + class RuntimeToolsProvider(BaseContextProvider): + def __init__(self) -> None: + super().__init__(source_id="runtime-tools") + + async def before_run(self, *, agent: Any, session: Any, context: Any, state: Any) -> None: + del agent, session, state + tools = context.options.get("tools", []) + seen_tools.extend(list(tools) if isinstance(tools, list) else [tools]) + + runtime_tool = FunctionTool(func=lambda: "runtime", name="runtime_tool", description="Runtime tool") + agent = Agent(client=chat_client_base, context_providers=[RuntimeToolsProvider()]) + + await agent._prepare_run_context( # type: ignore[reportPrivateUsage] + messages="Hello", + session=agent.create_session(), + tools=[runtime_tool], + options=None, + compaction_strategy=None, + tokenizer=None, + legacy_kwargs={}, + function_invocation_kwargs=None, + client_kwargs=None, + ) + + assert seen_tools == [runtime_tool] + + +async def test_context_provider_can_remove_runtime_tools_from_run( + chat_client_base: SupportsChatGetResponse, +) -> None: + class RuntimeToolsProvider(BaseContextProvider): + def __init__(self) -> None: + super().__init__(source_id="runtime-tools") + + async def before_run(self, *, agent: Any, session: Any, context: Any, state: Any) -> None: + del agent, session, state + context.options["tools"] = [] + + base_tool = FunctionTool(func=lambda: 
"base", name="base_tool", description="Base tool") + runtime_tool = FunctionTool(func=lambda: "runtime", name="runtime_tool", description="Runtime tool") + agent = Agent(client=chat_client_base, tools=[base_tool], context_providers=[RuntimeToolsProvider()]) + + ctx = await agent._prepare_run_context( # type: ignore[reportPrivateUsage] + messages="Hello", + session=agent.create_session(), + tools=[runtime_tool], + options=None, + compaction_strategy=None, + tokenizer=None, + legacy_kwargs={}, + function_invocation_kwargs=None, + client_kwargs=None, + ) + + tool_names = [_get_tool_name(tool_obj) for tool_obj in ctx["chat_options"]["tools"]] + assert tool_names == ["base_tool"] + + async def test_chat_client_agent_run_with_session(chat_client_base: SupportsChatGetResponse) -> None: mock_response = ChatResponse( messages=[Message(role="assistant", contents=[Content.from_text("test response")])], diff --git a/python/samples/02-agents/tools/code_mode_context_provider.py b/python/samples/02-agents/tools/code_mode_context_provider.py new file mode 100644 index 0000000000..2ce9ca5ff9 --- /dev/null +++ b/python/samples/02-agents/tools/code_mode_context_provider.py @@ -0,0 +1,491 @@ +# Copyright (c) Microsoft. All rights reserved. + +from __future__ import annotations + +import asyncio +import json +import logging +import os +from collections.abc import Sequence +from pathlib import Path +from textwrap import indent +from typing import Annotated, Any, Literal + +from agent_framework import Agent, AgentSession, BaseContextProvider, Content, FunctionTool, SessionContext, tool +from agent_framework._tools import normalize_tools +from agent_framework.azure import AzureOpenAIResponsesClient +from azure.identity import AzureCliCredential +from dotenv import load_dotenv + +try: + from hyperlight_sandbox import WasmSandbox +except ModuleNotFoundError as exc: + raise RuntimeError( + "This prototype expects an upstream `hyperlight_sandbox.WasmSandbox` " + "implementation. 
Install the provisional Hyperlight package once it " + "is available, or update this sample to match the final import path." + ) from exc + +load_dotenv() + +logger = logging.getLogger(__name__) + + +"""This sample demonstrates a ContextProvider-driven Hyperlight code-mode prototype. + +The provider owns sandbox lifecycle, discovers tools from the agent, +and injects dynamic instructions plus a single `execute_code` tool at run time. + +Tools passed to `agent.run(..., tools=...)` are available in +`context.options["tools"]`, so the provider can merge them into the sandbox and, +when configured, remove them from the model-facing run options too. +""" + + +def collect_tools(*tool_groups: Any) -> list[FunctionTool]: + """Normalize and collect unique ``FunctionTool`` instances, excluding execute_code.""" + + tools: list[FunctionTool] = [] + seen_names: set[str] = set() + + for tool_group in tool_groups: + normalized_group: Sequence[Any] + if ( + isinstance(tool_group, Sequence) + and not isinstance(tool_group, (str, bytes, bytearray)) + and all(isinstance(tool_obj, FunctionTool) for tool_obj in tool_group) + ): + normalized_group = tool_group + else: + normalized_group = normalize_tools(tool_group) + + for tool_obj in normalized_group: + if not isinstance(tool_obj, FunctionTool): + continue + + name = tool_obj.name + if name == "execute_code" or name in seen_names: + continue + + seen_names.add(name) + tools.append(tool_obj) + + return tools + + +def _resolve_execute_code_approval_mode( + *, + base_approval_mode: Literal["always_require", "never_require"] | None, + tools: Sequence[FunctionTool], +) -> Literal["always_require", "never_require"]: + """Return the strictest approval mode needed for execute_code.""" + + if base_approval_mode == "always_require": + return "always_require" + + if any(tool_obj.approval_mode == "always_require" for tool_obj in tools): + return "always_require" + + return "never_require" + + +def _tool_signature(tools: Sequence[FunctionTool]) -> 
tuple[tuple[str, int], ...]: + """Build a stable signature for a normalized tool sequence.""" + + return tuple((tool_obj.name, id(tool_obj)) for tool_obj in tools) + + +def _build_code_mode_instructions( + *, + tools: Sequence[FunctionTool], + tools_visible_to_model: bool, +) -> str: + """Build dynamic code-mode instructions for the discovered tools.""" + + if tools: + tools_descriptions = "\n\n".join([ + f"- `{tool_obj.name}`\n" + f" Description: {str(tool_obj.description or '').strip() or 'No description provided.'}\n" + " Parameters:\n" + f"{indent(json.dumps(tool_obj.parameters(), indent=2, sort_keys=True), ' ')}" + for tool_obj in tools + ]) + else: + tools_descriptions = "- No tools are currently registered inside the sandbox." + + visibility_note = ( + "Some tools listed below may also appear as normal tools, but you should still prefer " + "execute_code and call them from inside the sandbox. Only if you want to run just that single tool " + "can you use it directly." + if tools_visible_to_model + else "The tools listed below are registered inside the sandbox even if they do not appear as " + "normal tools. Access them through execute_code with call_tool(...)." + ) + + return f"""You have one primary tool: execute_code. + +It runs Python in an isolated Hyperlight Wasm sandbox. You do NOT have direct +access to data. The ONLY way to fetch data or perform computations is by +writing Python code via execute_code that calls `call_tool()` inside the +sandbox. + +`call_tool` is a built-in global inside the sandbox. No import is needed. + +{visibility_note} + +Available sandbox tools: +{tools_descriptions} + +Correct usage: +result = call_tool("tool_name", keyword=value) + +You can combine multiple call_tool(...) calls with regular Python code in the +same execute_code block, including loops, conditionals, variables, and +post-processing of tool results. 
+ +Wrong usage: +call_tool("tool_name", {{"keyword": "value"}}) + +Do NOT hardcode data that should come from call_tool(...). +Prefer one execute_code call per request when possible. +Always include the complete stdout from execute_code in your final answer. +""" + + +def _create_wasm_sandbox(*, module_path: Path) -> Any: + """Create the provisional Hyperlight Wasm sandbox instance.""" + + try: + from hyperlight_sandbox import WasmSandbox + except ModuleNotFoundError as exc: + raise RuntimeError( + "This prototype expects an upstream `hyperlight_sandbox.WasmSandbox` " + "implementation. Install the provisional Hyperlight package once it " + "is available, or update this sample to match the final import path." + ) from exc + if not module_path.exists(): + raise RuntimeError( + "Hyperlight Wasm module not found.\n" + f" module: {module_path} (MISSING)\n" + "Build the provisional python-sandbox AOT module first, or set " + "HYPERLIGHT_MODULE to the correct path." + ) + + return WasmSandbox(module_path=str(module_path)) + + +@tool(approval_mode="never_require") +def compute( + operation: Annotated[str, "Math operation: add, subtract, multiply, or divide."], + a: Annotated[float, "First numeric operand."], + b: Annotated[float, "Second numeric operand."], +) -> float: + """Perform a math operation used by sandbox code.""" + + operations = { + "add": a + b, + "subtract": a - b, + "multiply": a * b, + "divide": a / b if b else float("inf"), + } + return operations.get(operation, 0.0) + + +@tool(approval_mode="never_require") +def fetch_data( + table: Annotated[str, "Name of the simulated table to query."], +) -> list[dict[str, Any]]: + """Fetch simulated records from a named table.""" + + return { + "users": [ + {"id": 1, "name": "Alice", "role": "admin"}, + {"id": 2, "name": "Bob", "role": "user"}, + {"id": 3, "name": "Charlie", "role": "admin"}, + ], + "products": [ + {"id": 101, "name": "Widget", "price": 9.99}, + {"id": 102, "name": "Gadget", "price": 19.99}, + ], + 
}.get(table, []) + + +class CodeModeContextProvider(BaseContextProvider): + """Inject a code-mode surface using agent-configured tools.""" + + DEFAULT_SOURCE_ID = "code_mode_provider" + + def __init__( + self, + source_id: str = DEFAULT_SOURCE_ID, + *, + tools: Sequence[FunctionTool] | None = None, + remove_tools_from_agent: bool = True, + approval_mode: Literal["always_require", "never_require"] | None = None, + ) -> None: + """Initialize the provider. + + Args: + source_id: Unique provider source identifier. + + Keyword Args: + tools: Additional sandbox-managed tools owned by the provider. + These are available through ``call_tool(...)`` inside + ``execute_code`` and are never surfaced to the model as + separate tools. + remove_tools_from_agent: When True, remove the + tools from the model-facing tool list after the provider + captures them, including tools passed at run time. + approval_mode: Base approval mode for the provider-managed + `execute_code` tool. The effective mode is upgraded to the + strictest mode required by the managed tools for each run. + Default is evaluated as `never_require`. + """ + + super().__init__(source_id) + self._provider_tools = collect_tools(tools) + self._remove_tools_from_agent = remove_tools_from_agent + self._approval_mode = approval_mode + self._agent_tools: list[FunctionTool] | None = None + self._managed_tools: list[FunctionTool] = [] + self._base_signature: tuple[tuple[str, int], ...] = () + self._runtime_signature: tuple[tuple[str, int], ...] = () + self._module_path = Path( + os.environ.get( + "HYPERLIGHT_MODULE", str(Path(__file__).resolve().parents[3] / "src/python_sandbox/python-sandbox.aot") + ) + ) + if not self._module_path.exists(): + raise RuntimeError( + "Hyperlight Wasm module not found.\n" + f" module: {self._module_path} (MISSING)\n" + "Build the provisional python-sandbox AOT module first, or set " + "HYPERLIGHT_MODULE to the correct path." 
+ ) + self._base_sandbox: Any = None + self._base_snapshot: Any = None + self._runtime_sandbox: Any = None + self._runtime_snapshot: Any = None + self._sandbox: Any = None + self._snapshot: Any = None + + self._execute_code_tool = FunctionTool( + name="execute_code", + description=( + "Python code to execute in an isolated sandbox. " + "Use call_tool(...) inside the code to access other tools." + ), + func=self._run_code, + input_model={ + "type": "object", + "properties": { + "code": { + "type": "string", + "description": ( + "Python code to execute in an isolated sandbox. " + "Use call_tool(...) inside the code to access other tools." + ), + } + }, + "required": ["code"], + }, + approval_mode=self._approval_mode, + ) + + @staticmethod + def _build_sandbox_and_snapshot(*, tools: Sequence[FunctionTool], module_path: Path) -> tuple[Any, Any]: + """Build a sandbox and clean snapshot for the given tool set.""" + sandbox = WasmSandbox(module_path=str(module_path)) + + for tool_obj in tools: + sandbox.register_tool(tool_obj.name, tool_obj.invoke) + + sandbox.run("None") + snapshot = sandbox.snapshot() + + logger.debug("Sandbox initialized and snapshotted.") + return sandbox, snapshot + + def _initialize_sandbox( + self, + *, + base_tools: Sequence[FunctionTool], + runtime_tools: Sequence[FunctionTool], + ) -> None: + """Initialize or reuse the appropriate base/runtime sandbox snapshot.""" + + managed_tools = collect_tools(base_tools, runtime_tools) + + base_signature = _tool_signature(base_tools) + if base_signature != self._base_signature: + self._base_signature = base_signature + self._base_sandbox = None + self._base_snapshot = None + self._runtime_signature = () + self._runtime_sandbox = None + self._runtime_snapshot = None + + if self._base_snapshot is None or self._base_sandbox is None: + self._base_sandbox, self._base_snapshot = self._build_sandbox_and_snapshot( + tools=base_tools, module_path=self._module_path + ) + + if not runtime_tools: + self._sandbox = 
self._base_sandbox + self._snapshot = self._base_snapshot + self._managed_tools = managed_tools + + runtime_signature = _tool_signature(runtime_tools) + if runtime_signature != self._runtime_signature: + self._runtime_signature = runtime_signature + self._runtime_sandbox = None + self._runtime_snapshot = None + + if self._runtime_snapshot is None or self._runtime_sandbox is None: + # TODO: Derive runtime snapshots from the restored base snapshot once + # the provisional Hyperlight API makes incremental tool layering practical. + self._runtime_sandbox, self._runtime_snapshot = self._build_sandbox_and_snapshot( + tools=managed_tools, module_path=self._module_path + ) + + self._sandbox = self._runtime_sandbox + self._snapshot = self._runtime_snapshot + self._managed_tools = managed_tools + + def _run_code(self, *, code: str) -> list[Content]: + """Restore the sandbox and execute one block of Python code.""" + + if self._sandbox is None or self._snapshot is None: + raise RuntimeError("Sandbox has not been initialized yet.") + + self._sandbox.restore(self._snapshot) + result = self._sandbox.run(code=code) + + success = bool(getattr(result, "success", False)) + stdout = str(getattr(result, "stdout", "") or "").replace("\r\n", "\n") + stderr = str(getattr(result, "stderr", "") or "") + + if success: + logger.debug("execute_code completed.") + contents: list[Content] = [] + if stdout: + contents.append(Content.from_text(stdout)) + if stderr: + contents.append( + Content.from_text( + f"stderr:\n{stderr}", + additional_properties={"stream": "stderr"}, + ) + ) + return contents or [Content.from_text("Code executed successfully without output.")] + + logger.debug("execute_code failed.") + error_details = stderr or "Unknown sandbox error" + return [ + Content.from_text(f"Execution error:\n{error_details}"), + Content.from_error(message="Execution error", error_details=error_details), + ] + + async def before_run( + self, + *, + agent: Any, + session: AgentSession | None, + 
context: SessionContext, + state: dict[str, Any], + ) -> None: # noqa: ARG002 + """Inject code-mode instructions and the execute_code tool before each run.""" + + if self._agent_tools is None and isinstance(agent, Agent): + self._agent_tools = collect_tools(agent.default_options.get("tools", [])) + + if self._remove_tools_from_agent: + agent.default_options["tools"] = [ + tool_obj + for tool_obj in agent.default_options.get("tools", []) + if getattr(tool_obj, "name", None) == "execute_code" + ] + + runtime_tools = collect_tools(context.options.get("tools")) + self._initialize_sandbox( + base_tools=collect_tools(self._provider_tools, self._agent_tools or []), + runtime_tools=runtime_tools, + ) + self._execute_code_tool.approval_mode = _resolve_execute_code_approval_mode( + base_approval_mode=self._approval_mode, + tools=self._managed_tools, + ) + + if self._remove_tools_from_agent: + context.options.pop("tools") + + context.extend_instructions( + self.source_id, + _build_code_mode_instructions( + tools=self._managed_tools, + tools_visible_to_model=not self._remove_tools_from_agent, + ), + ) + context.extend_tools(self.source_id, [self._execute_code_tool]) + + +async def main() -> None: + """Run the provider-managed code-mode sample.""" + + agent = Agent( + client=AzureOpenAIResponsesClient( + project_endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + deployment_name=os.environ["AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME"], + credential=AzureCliCredential(), + ), + name="HyperlightCodeModeProviderAgent", + instructions="You are a helpful assistant.", + tools=[compute, fetch_data], + context_providers=[CodeModeContextProvider(approval_mode="never_require")], + ) + + print("=" * 60) + print("ContextProvider sample") + print("=" * 60) + query = ( + "Fetch all users, find admins, multiply 6*7, and print the users, admins, " + "and multiplication result. Use one execute_code call." 
+ ) + print(f"User: {query}") + result = await agent.run(query) + print(f"Agent: {result.text}") + + +""" +Sample output (shape only): + +Sandbox initialized and snapshotted (...) +============================================================ +ContextProvider sample +============================================================ +remove_tools_from_agent=True +approval_mode=never_require +User: Fetch all users, find admins, multiply 6*7, and print the users, admins, +and multiplication result. Use one execute_code call. +Agent: ... + +Notes: +- Pass tools to `CodeModeContextProvider(tools=[...])` to register sandbox-only + tools that are available through `call_tool(...)` but never exposed to the + model as separate tools. +- `remove_tools_from_agent` defaults to `True`, so the provider hides both + agent-configured and per-run tools from the model-facing tool list unless + you opt out. +- Set `approval_mode` on `CodeModeContextProvider(...)` to control the approval + behavior of the provider-managed `execute_code` tool. +- Pass tools to `agent.run(..., tools=runtime_tools)` to expose them as per-run + tools. The provider reads them from `context.options["tools"]`, registers + them with the sandbox, and clears them from the run options when removal is + enabled. +- This sample prioritizes the intended API shape over confirmed Hyperlight + runtime integration. +""" + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/02-agents/tools/code_mode_tool.py b/python/samples/02-agents/tools/code_mode_tool.py new file mode 100644 index 0000000000..565ccedc85 --- /dev/null +++ b/python/samples/02-agents/tools/code_mode_tool.py @@ -0,0 +1,347 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +from __future__ import annotations + +import asyncio +import json +import logging +import os +from collections.abc import Sequence +from pathlib import Path +from textwrap import indent +from typing import Annotated, Any + +from agent_framework import Agent, Content, FunctionTool, tool +from agent_framework._tools import normalize_tools +from agent_framework.azure import AzureOpenAIResponsesClient +from azure.identity import AzureCliCredential +from dotenv import load_dotenv +from pydantic import Field + +logger = logging.getLogger(__name__) + +"""This sample demonstrates a direct-tool Hyperlight code-mode prototype. + +The sample creates an `Agent(client=AzureOpenAIResponsesClient(...), ...)` with a +primary `execute_code` tool plus schema-visible tools. It also supports +per-run runtime tools by registering them with the sandbox before the run and +passing them through `agent.run(..., tools=runtime_tools)`. +""" + +DEFAULT_PROMPT = ( + "Fetch all users, find admins, multiply 6*7, and print the users, admins, " + "and multiplication result. Use one execute_code call." 
+) + +_SIMULATED_DATA: dict[str, list[dict[str, Any]]] = { + "users": [ + {"id": 1, "name": "Alice", "role": "admin"}, + {"id": 2, "name": "Bob", "role": "user"}, + {"id": 3, "name": "Charlie", "role": "admin"}, + ], + "products": [ + {"id": 101, "name": "Widget", "price": 9.99}, + {"id": 102, "name": "Gadget", "price": 19.99}, + ], +} + + +def _repo_root() -> Path: + """Return the Python repo root used to resolve the default sandbox module path.""" + + return Path(__file__).resolve().parents[3] + + +def _default_module_path() -> Path: + """Return the provisional default path for the Hyperlight AOT module.""" + + return _repo_root() / "src/python_sandbox/python-sandbox.aot" + + +def collect_tools(*tool_groups: Any) -> list[FunctionTool]: + """Normalize and collect unique ``FunctionTool`` instances, excluding execute_code.""" + + tools: list[FunctionTool] = [] + seen_names: set[str] = set() + + for tool_group in tool_groups: + normalized_group: Sequence[Any] + if isinstance(tool_group, Sequence) and not isinstance(tool_group, (str, bytes, bytearray)) and all( + isinstance(tool_obj, FunctionTool) for tool_obj in tool_group + ): + normalized_group = tool_group + else: + normalized_group = normalize_tools(tool_group) + + for tool_obj in normalized_group: + if not isinstance(tool_obj, FunctionTool): + continue + + name = tool_obj.name + if name == "execute_code" or name in seen_names: + continue + + seen_names.add(name) + tools.append(tool_obj) + + return tools + + +def build_code_mode_instructions( + *, + tools: Sequence[FunctionTool], + tools_visible_to_model: bool, +) -> str: + """Build dynamic code-mode instructions for the discovered tools.""" + + if tools: + callback_lines = "\n\n".join( + [ + f"- `{tool_obj.name}`\n" + f" Description: {str(tool_obj.description or '').strip() or 'No description provided.'}\n" + " Parameters:\n" + f"{indent(json.dumps(tool_obj.parameters(), indent=2, sort_keys=True), ' ')}" + for tool_obj in tools + ] + ) + else: + callback_lines = 
"- No tools are currently registered inside the sandbox." + + visibility_note = ( + "The tools listed below may also appear as normal tools, but you should still prefer " + "execute_code and call them from inside the sandbox." + if tools_visible_to_model + else "The tools listed below are registered inside the sandbox even if they do not appear as " + "normal tools. Access them through execute_code with call_tool(...)." + ) + + return f"""You have one primary tool: execute_code. + +It runs Python in an isolated Hyperlight Wasm sandbox. You do NOT have direct +access to data. The ONLY way to fetch data or perform computations is by +writing Python code via execute_code that calls `call_tool()` inside the +sandbox. + +`call_tool` is a built-in global inside the sandbox. No import is needed. + +{visibility_note} + +Available sandbox tools: +{callback_lines} + +Correct usage: +result = call_tool("tool_name", keyword=value) + +You can combine multiple call_tool(...) calls with regular Python code in the +same execute_code block, including loops, conditionals, variables, and +post-processing of tool results. + +Wrong usage: +call_tool("tool_name", {{"keyword": "value"}}) + +Do NOT hardcode data that should come from call_tool(...). +Prefer one execute_code call per request when possible. +Always include the complete stdout from execute_code in your final answer. +""" + + +def _create_wasm_sandbox(*, module_path: Path) -> Any: + """Create the provisional Hyperlight Wasm sandbox instance.""" + + try: + from hyperlight_sandbox import WasmSandbox + except ModuleNotFoundError as exc: + raise RuntimeError( + "This prototype expects an upstream `hyperlight_sandbox.WasmSandbox` " + "implementation. Install the provisional Hyperlight package once it " + "is available, or update this sample to match the final import path." 
+ ) from exc + + return WasmSandbox(module_path=str(module_path)) + + +class CodeModeSandboxManager: + """Manage the provisional Hyperlight sandbox lifecycle for this sample.""" + + def __init__(self, *, module_path: Path | None = None) -> None: + """Initialize the sandbox manager.""" + + self._module_path = module_path or Path(os.environ.get("HYPERLIGHT_MODULE", str(_default_module_path()))) + self._tools: list[FunctionTool] = [] + self._callback_signature: tuple[tuple[str, int], ...] = () + self._sandbox: Any = None + self._snapshot: Any = None + + def set_tools(self, tools: Sequence[FunctionTool]) -> None: + """Set the tools that should be registered with the sandbox.""" + + signature = tuple((tool_obj.name, id(tool_obj)) for tool_obj in tools) + if signature == self._callback_signature: + return + + self._tools = list(tools) + self._callback_signature = signature + self._sandbox = None + self._snapshot = None + + def initialize(self) -> None: + """Initialize the sandbox once and capture a reusable clean snapshot.""" + + if self._sandbox is not None and self._snapshot is not None: + return + + if not self._module_path.exists(): + raise RuntimeError( + "Hyperlight Wasm module not found.\n" + f" module: {self._module_path} (MISSING)\n" + "Build the provisional python-sandbox AOT module first, or set " + "HYPERLIGHT_MODULE to the correct path." 
+ ) + + self._sandbox = _create_wasm_sandbox(module_path=self._module_path) + + for tool_obj in self._tools: + self._sandbox.register_tool(tool_obj.name, tool_obj.invoke) + + self._sandbox.run("None") + self._snapshot = self._sandbox.snapshot() + + logger.debug("Sandbox initialized and snapshotted.") + + def run_code(self, *, code: str) -> list[Content]: + """Restore the sandbox and execute one block of Python code.""" + + if self._sandbox is None or self._snapshot is None: + raise RuntimeError("Sandbox has not been initialized yet.") + + logger.debug("--- Model generated code ---\n%s\n--- end ---\n", code) + + self._sandbox.restore(self._snapshot) + result = self._sandbox.run(code=code) + + success = bool(getattr(result, "success", False)) + stdout = str(getattr(result, "stdout", "") or "").replace("\r\n", "\n") + stderr = str(getattr(result, "stderr", "") or "") + + if success: + logger.debug("execute_code completed.") + contents: list[Content] = [] + if stdout: + contents.append(Content.from_text(stdout)) + if stderr: + contents.append( + Content.from_text( + f"stderr:\n{stderr}", + additional_properties={"stream": "stderr"}, + ) + ) + return contents or [Content.from_text("Code executed successfully without output.")] + + logger.debug("execute_code failed.") + error_details = stderr or "Unknown sandbox error" + return [ + Content.from_text(f"Execution error:\n{error_details}"), + Content.from_error(message="Execution error", error_details=error_details), + ] + + +@tool(approval_mode="never_require") +def compute( + operation: Annotated[str, "Math operation: add, subtract, multiply, or divide."], + a: Annotated[float, "First numeric operand."], + b: Annotated[float, "Second numeric operand."], +) -> float: + """Perform a math operation used by sandbox code.""" + + operations = { + "add": a + b, + "subtract": a - b, + "multiply": a * b, + "divide": a / b if b else float("inf"), + } + return operations.get(operation, 0.0) + + +@tool(approval_mode="never_require") 
+def fetch_data( + table: Annotated[str, "Name of the simulated table to query."], +) -> list[dict[str, Any]]: + """Fetch simulated records from a named table.""" + + return _SIMULATED_DATA.get(table, []) + + +async def main() -> None: + """Run the direct-tool code-mode sample.""" + + load_dotenv() + + runtime_tools: list[Any] = [] + sandbox_manager = CodeModeSandboxManager() + + @tool(name="execute_code", approval_mode="never_require") + async def execute_code( + code: Annotated[ + str, + Field( + description=( + "Python code to execute in an isolated Hyperlight Wasm sandbox. " + "Use call_tool(...) inside the code to access registered host callbacks." + ) + ), + ], + ) -> list[Content]: + """Execute code inside the provisional sandbox wrapper.""" + + return sandbox_manager.run_code(code=code) + + agent = Agent( + client=AzureOpenAIResponsesClient( + project_endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + deployment_name=os.environ["AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME"], + credential=AzureCliCredential(), + ), + name="HyperlightCodeModeToolAgent", + instructions="Temporary instructions replaced before the run.", + tools=[execute_code, compute, fetch_data], + ) + + tools = collect_tools(agent.default_options.get("tools", []), runtime_tools) + sandbox_manager.set_tools(tools) + sandbox_manager.initialize() + agent.default_options["instructions"] = build_code_mode_instructions( + tools=tools, + tools_visible_to_model=True, + ) + + logger.debug("%s", "=" * 60) + logger.debug("Direct tool sample") + logger.debug("%s", "=" * 60) + logger.debug("runtime_tool_count=%s", len(runtime_tools)) + logger.debug("User: %s", DEFAULT_PROMPT) + result = await agent.run(DEFAULT_PROMPT, tools=runtime_tools) + logger.debug("Agent: %s\n", result) + + +""" +Sample output (shape only): + +Sandbox initialized and snapshotted (...) 
+============================================================ +Direct tool sample +============================================================ +runtime_tool_count=0 +User: Fetch all users, find admins, multiply 6*7, and print the users, admins, +and multiplication result. Use one execute_code call. +Agent: ... + +Notes: +- Add tools to `runtime_tools` before calling `agent.run(...)` to expose them as + per-run tools and sandbox callbacks. +- This sample prioritizes the intended API shape over confirmed Hyperlight + runtime integration. +""" + + +if __name__ == "__main__": + asyncio.run(main()) From 4fbb5b20482b4b98ec803f68690a25de64c277ea Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Mon, 23 Mar 2026 11:01:16 +0100 Subject: [PATCH 02/17] updated samples --- .worktrees/devui_datastar | 1 + .worktrees/issue-4675-duplicate-telemetry | 1 + .worktrees/issue-4676-a2a-sdk-update | 1 + .../tools/code_mode_context_provider.py | 70 +++++++------- .../samples/02-agents/tools/code_mode_tool.py | 95 +++++++++---------- 5 files changed, 88 insertions(+), 80 deletions(-) create mode 160000 .worktrees/devui_datastar create mode 160000 .worktrees/issue-4675-duplicate-telemetry create mode 160000 .worktrees/issue-4676-a2a-sdk-update diff --git a/.worktrees/devui_datastar b/.worktrees/devui_datastar new file mode 160000 index 0000000000..bf8d9672e1 --- /dev/null +++ b/.worktrees/devui_datastar @@ -0,0 +1 @@ +Subproject commit bf8d9672e147c42696a5a17b0ed37878196b6715 diff --git a/.worktrees/issue-4675-duplicate-telemetry b/.worktrees/issue-4675-duplicate-telemetry new file mode 160000 index 0000000000..55cc6e85c0 --- /dev/null +++ b/.worktrees/issue-4675-duplicate-telemetry @@ -0,0 +1 @@ +Subproject commit 55cc6e85c08db4d7795a48e85261655efd895409 diff --git a/.worktrees/issue-4676-a2a-sdk-update b/.worktrees/issue-4676-a2a-sdk-update new file mode 160000 index 0000000000..c551983295 --- /dev/null +++ b/.worktrees/issue-4676-a2a-sdk-update @@ -0,0 +1 @@ +Subproject commit 
c5519832953763b847b7cacc515edb78cf50d28d diff --git a/python/samples/02-agents/tools/code_mode_context_provider.py b/python/samples/02-agents/tools/code_mode_context_provider.py index 2ce9ca5ff9..77aad2a9aa 100644 --- a/python/samples/02-agents/tools/code_mode_context_provider.py +++ b/python/samples/02-agents/tools/code_mode_context_provider.py @@ -1,3 +1,24 @@ +# /// script +# requires-python = ">=3.12,<3.13" +# dependencies = [ +# "hyperlight-sandbox", +# "hyperlight-sandbox-backend-wasm", +# "hyperlight-sandbox-python-guest", +# ] +# [tool.uv.sources] +# hyperlight-sandbox = { index = "testpypi" } +# hyperlight-sandbox-backend-wasm = { index = "testpypi" } +# hyperlight-sandbox-python-guest = { index = "testpypi" } +# [[tool.uv.index]] +# name = "testpypi" +# url = "https://test.pypi.org/simple/" +# explicit = true +# /// +# Bootstrap manually with: +# uv pip install --python 3.12 --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple \ +# hyperlight-sandbox hyperlight-sandbox-backend-wasm hyperlight-sandbox-python-guest +# Run with: uv run --python 3.12 samples/02-agents/tools/code_mode_context_provider.py +# # Copyright (c) Microsoft. All rights reserved. from __future__ import annotations @@ -18,10 +39,10 @@ from dotenv import load_dotenv try: - from hyperlight_sandbox import WasmSandbox + from hyperlight_sandbox import Sandbox except ModuleNotFoundError as exc: raise RuntimeError( - "This prototype expects an upstream `hyperlight_sandbox.WasmSandbox` " + "This prototype expects an upstream `hyperlight_sandbox.Sandbox` " "implementation. Install the provisional Hyperlight package once it " "is available, or update this sample to match the final import path." 
) from exc @@ -156,10 +177,10 @@ def _create_wasm_sandbox(*, module_path: Path) -> Any: """Create the provisional Hyperlight Wasm sandbox instance.""" try: - from hyperlight_sandbox import WasmSandbox + from hyperlight_sandbox import Sandbox except ModuleNotFoundError as exc: raise RuntimeError( - "This prototype expects an upstream `hyperlight_sandbox.WasmSandbox` " + "This prototype expects an upstream `hyperlight_sandbox.Sandbox` " "implementation. Install the provisional Hyperlight package once it " "is available, or update this sample to match the final import path." ) from exc @@ -171,7 +192,7 @@ def _create_wasm_sandbox(*, module_path: Path) -> Any: "HYPERLIGHT_MODULE to the correct path." ) - return WasmSandbox(module_path=str(module_path)) + return Sandbox(backend="wasm", module_path=str(module_path)) @tool(approval_mode="never_require") @@ -250,23 +271,12 @@ def __init__( self._managed_tools: list[FunctionTool] = [] self._base_signature: tuple[tuple[str, int], ...] = () self._runtime_signature: tuple[tuple[str, int], ...] = () - self._module_path = Path( - os.environ.get( - "HYPERLIGHT_MODULE", str(Path(__file__).resolve().parents[3] / "src/python_sandbox/python-sandbox.aot") - ) - ) - if not self._module_path.exists(): - raise RuntimeError( - "Hyperlight Wasm module not found.\n" - f" module: {self._module_path} (MISSING)\n" - "Build the provisional python-sandbox AOT module first, or set " - "HYPERLIGHT_MODULE to the correct path." 
- ) - self._base_sandbox: Any = None + self._module_path = "python_guest.path" + self._base_sandbox: Sandbox | None = None self._base_snapshot: Any = None - self._runtime_sandbox: Any = None + self._runtime_sandbox: Sandbox | None = None self._runtime_snapshot: Any = None - self._sandbox: Any = None + self._sandbox: Sandbox | None = None self._snapshot: Any = None self._execute_code_tool = FunctionTool( @@ -293,9 +303,9 @@ def __init__( ) @staticmethod - def _build_sandbox_and_snapshot(*, tools: Sequence[FunctionTool], module_path: Path) -> tuple[Any, Any]: + def _build_sandbox_and_snapshot(*, tools: Sequence[FunctionTool], module_path: str) -> tuple[Sandbox, Any]: """Build a sandbox and clean snapshot for the given tool set.""" - sandbox = WasmSandbox(module_path=str(module_path)) + sandbox = Sandbox(backend="wasm", module_path=module_path) for tool_obj in tools: sandbox.register_tool(tool_obj.name, tool_obj.invoke) @@ -361,26 +371,22 @@ def _run_code(self, *, code: str) -> list[Content]: self._sandbox.restore(self._snapshot) result = self._sandbox.run(code=code) - success = bool(getattr(result, "success", False)) - stdout = str(getattr(result, "stdout", "") or "").replace("\r\n", "\n") - stderr = str(getattr(result, "stderr", "") or "") - - if success: + if result.success: logger.debug("execute_code completed.") contents: list[Content] = [] - if stdout: - contents.append(Content.from_text(stdout)) - if stderr: + if result.stdout: + contents.append(Content.from_text(result.stdout)) + if result.stderr: contents.append( Content.from_text( - f"stderr:\n{stderr}", + f"stderr:\n{result.stderr}", additional_properties={"stream": "stderr"}, ) ) return contents or [Content.from_text("Code executed successfully without output.")] logger.debug("execute_code failed.") - error_details = stderr or "Unknown sandbox error" + error_details = result.stderr or "Unknown sandbox error" return [ Content.from_text(f"Execution error:\n{error_details}"), 
Content.from_error(message="Execution error", error_details=error_details), diff --git a/python/samples/02-agents/tools/code_mode_tool.py b/python/samples/02-agents/tools/code_mode_tool.py index 565ccedc85..d3c06d699a 100644 --- a/python/samples/02-agents/tools/code_mode_tool.py +++ b/python/samples/02-agents/tools/code_mode_tool.py @@ -1,3 +1,24 @@ +# /// script +# requires-python = ">=3.12,<3.13" +# dependencies = [ +# "hyperlight-sandbox", +# "hyperlight-sandbox-backend-wasm", +# "hyperlight-sandbox-python-guest", +# ] +# [tool.uv.sources] +# hyperlight-sandbox = { index = "testpypi" } +# hyperlight-sandbox-backend-wasm = { index = "testpypi" } +# hyperlight-sandbox-python-guest = { index = "testpypi" } +# [[tool.uv.index]] +# name = "testpypi" +# url = "https://test.pypi.org/simple/" +# explicit = true +# /// +# Bootstrap manually with: +# uv pip install --python 3.12 --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple \ +# hyperlight-sandbox hyperlight-sandbox-backend-wasm hyperlight-sandbox-python-guest +# Run with: uv run --python 3.12 samples/02-agents/tools/code_mode_tool.py +# # Copyright (c) Microsoft. All rights reserved. from __future__ import annotations @@ -7,7 +28,6 @@ import logging import os from collections.abc import Sequence -from pathlib import Path from textwrap import indent from typing import Annotated, Any @@ -16,7 +36,17 @@ from agent_framework.azure import AzureOpenAIResponsesClient from azure.identity import AzureCliCredential from dotenv import load_dotenv -from pydantic import Field + +try: + from hyperlight_sandbox import Sandbox +except ModuleNotFoundError as exc: + raise RuntimeError( + "This prototype expects an upstream `hyperlight_sandbox.Sandbox` " + "implementation. Install the provisional Hyperlight packages from TestPyPI, " + "or update this sample to match the final import path." 
+ ) from exc + +load_dotenv() logger = logging.getLogger(__name__) @@ -45,17 +75,7 @@ ], } - -def _repo_root() -> Path: - """Return the Python repo root used to resolve the default sandbox module path.""" - - return Path(__file__).resolve().parents[3] - - -def _default_module_path() -> Path: - """Return the provisional default path for the Hyperlight AOT module.""" - - return _repo_root() / "src/python_sandbox/python-sandbox.aot" +DEFAULT_HYPERLIGHT_MODULE = "python_guest.path" def collect_tools(*tool_groups: Any) -> list[FunctionTool]: @@ -145,28 +165,19 @@ def build_code_mode_instructions( """ -def _create_wasm_sandbox(*, module_path: Path) -> Any: +def _create_wasm_sandbox(*, module_ref: str) -> Sandbox: """Create the provisional Hyperlight Wasm sandbox instance.""" - try: - from hyperlight_sandbox import WasmSandbox - except ModuleNotFoundError as exc: - raise RuntimeError( - "This prototype expects an upstream `hyperlight_sandbox.WasmSandbox` " - "implementation. Install the provisional Hyperlight package once it " - "is available, or update this sample to match the final import path." - ) from exc - - return WasmSandbox(module_path=str(module_path)) + return Sandbox(backend="wasm", module=module_ref) class CodeModeSandboxManager: """Manage the provisional Hyperlight sandbox lifecycle for this sample.""" - def __init__(self, *, module_path: Path | None = None) -> None: + def __init__(self, *, module_ref: str | None = None) -> None: """Initialize the sandbox manager.""" - self._module_path = module_path or Path(os.environ.get("HYPERLIGHT_MODULE", str(_default_module_path()))) + self._module_ref = module_ref or os.environ.get("HYPERLIGHT_MODULE", DEFAULT_HYPERLIGHT_MODULE) self._tools: list[FunctionTool] = [] self._callback_signature: tuple[tuple[str, int], ...] 
= () self._sandbox: Any = None @@ -190,15 +201,7 @@ def initialize(self) -> None: if self._sandbox is not None and self._snapshot is not None: return - if not self._module_path.exists(): - raise RuntimeError( - "Hyperlight Wasm module not found.\n" - f" module: {self._module_path} (MISSING)\n" - "Build the provisional python-sandbox AOT module first, or set " - "HYPERLIGHT_MODULE to the correct path." - ) - - self._sandbox = _create_wasm_sandbox(module_path=self._module_path) + self._sandbox = _create_wasm_sandbox(module_ref=self._module_ref) for tool_obj in self._tools: self._sandbox.register_tool(tool_obj.name, tool_obj.invoke) @@ -274,8 +277,6 @@ def fetch_data( async def main() -> None: """Run the direct-tool code-mode sample.""" - load_dotenv() - runtime_tools: list[Any] = [] sandbox_manager = CodeModeSandboxManager() @@ -283,11 +284,9 @@ async def main() -> None: async def execute_code( code: Annotated[ str, - Field( - description=( - "Python code to execute in an isolated Hyperlight Wasm sandbox. " - "Use call_tool(...) inside the code to access registered host callbacks." - ) + ( + "Python code to execute in an isolated Hyperlight Wasm sandbox. " + "Use call_tool(...) inside the code to access registered host callbacks." 
), ], ) -> list[Content]: @@ -314,13 +313,13 @@ async def execute_code( tools_visible_to_model=True, ) - logger.debug("%s", "=" * 60) - logger.debug("Direct tool sample") - logger.debug("%s", "=" * 60) - logger.debug("runtime_tool_count=%s", len(runtime_tools)) - logger.debug("User: %s", DEFAULT_PROMPT) + print("=" * 60) + print("Direct tool sample") + print("=" * 60) + print(f"runtime_tool_count={len(runtime_tools)}") + print(f"User: {DEFAULT_PROMPT}") result = await agent.run(DEFAULT_PROMPT, tools=runtime_tools) - logger.debug("Agent: %s\n", result) + print(f"Agent: {result.text}") """ From 03a0bd7bee500665902dc05916b66c1794386692 Mon Sep 17 00:00:00 2001 From: Eduard van Valkenburg Date: Mon, 23 Mar 2026 16:37:08 +0100 Subject: [PATCH 03/17] updates to codeact --- ...rovider.py => codeact_context_provider.py} | 328 +++++++++++------- .../{code_mode_tool.py => codeact_tool.py} | 44 +-- 2 files changed, 218 insertions(+), 154 deletions(-) rename python/samples/02-agents/tools/{code_mode_context_provider.py => codeact_context_provider.py} (63%) rename python/samples/02-agents/tools/{code_mode_tool.py => codeact_tool.py} (90%) diff --git a/python/samples/02-agents/tools/code_mode_context_provider.py b/python/samples/02-agents/tools/codeact_context_provider.py similarity index 63% rename from python/samples/02-agents/tools/code_mode_context_provider.py rename to python/samples/02-agents/tools/codeact_context_provider.py index 77aad2a9aa..c2829e55ca 100644 --- a/python/samples/02-agents/tools/code_mode_context_provider.py +++ b/python/samples/02-agents/tools/codeact_context_provider.py @@ -17,7 +17,7 @@ # Bootstrap manually with: # uv pip install --python 3.12 --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple \ # hyperlight-sandbox hyperlight-sandbox-backend-wasm hyperlight-sandbox-python-guest -# Run with: uv run --python 3.12 samples/02-agents/tools/code_mode_context_provider.py +# Run with: uv run --python 3.12 
samples/02-agents/tools/codeact_context_provider.py # # Copyright (c) Microsoft. All rights reserved. @@ -27,12 +27,21 @@ import json import logging import os -from collections.abc import Sequence -from pathlib import Path +from collections.abc import Awaitable, Callable, Sequence from textwrap import indent from typing import Annotated, Any, Literal -from agent_framework import Agent, AgentSession, BaseContextProvider, Content, FunctionTool, SessionContext, tool +from agent_framework import ( + Agent, + AgentSession, + BaseContextProvider, + Content, + FunctionInvocationContext, + FunctionTool, + SessionContext, + function_middleware, + tool, +) from agent_framework._tools import normalize_tools from agent_framework.azure import AzureOpenAIResponsesClient from azure.identity import AzureCliCredential @@ -49,17 +58,41 @@ load_dotenv() +# ANSI color helpers for distinguishing output sources. +_CYAN = "\033[36m" +_YELLOW = "\033[33m" +_GREEN = "\033[32m" +_DIM = "\033[2m" +_RESET = "\033[0m" + + +class _ColoredFormatter(logging.Formatter): + """Dim logger output so it doesn't compete with middleware and main prints.""" + + def format(self, record: logging.LogRecord) -> str: + msg = super().format(record) + return f"{_DIM}{msg}{_RESET}" + + +logging.basicConfig(level=logging.WARNING) +logging.getLogger().handlers[0].setFormatter( + _ColoredFormatter("[%(asctime)s] %(levelname)s: %(message)s"), +) logger = logging.getLogger(__name__) -"""This sample demonstrates a ContextProvider-driven Hyperlight code-mode prototype. +"""This sample demonstrates a ContextProvider-driven Hyperlight CodeAct prototype. + +The provider owns sandbox lifecycle and the tools registered within it. +Tools are passed directly to the provider — not the agent — so the model +only sees the single ``execute_code`` tool. -The provider owns sandbox lifecycle, discovers tools from the agent, -and injects dynamic instructions plus a single `execute_code` tool at run time. 
+A logging function middleware is registered on the agent to show every tool +invocation (name, arguments, timing, and result) in the console output. -Tools passed to `agent.run(..., tools=...)` are available in -`context.options["tools"]`, so the provider can merge them into the sandbox and, -when configured, remove them from the model-facing run options too. +Per-run tools passed to ``agent.run(..., tools=...)`` are also captured by the +provider, registered with the sandbox, and removed from the model-facing tool +list. """ @@ -116,12 +149,12 @@ def _tool_signature(tools: Sequence[FunctionTool]) -> tuple[tuple[str, int], ... return tuple((tool_obj.name, id(tool_obj)) for tool_obj in tools) -def _build_code_mode_instructions( +def _build_codeact_instructions( *, tools: Sequence[FunctionTool], tools_visible_to_model: bool, ) -> str: - """Build dynamic code-mode instructions for the discovered tools.""" + """Build dynamic CodeAct instructions for the discovered tools.""" if tools: tools_descriptions = "\n\n".join([ @@ -145,12 +178,13 @@ def _build_code_mode_instructions( return f"""You have one primary tool: execute_code. -It runs Python in an isolated Hyperlight Wasm sandbox. You do NOT have direct +It runs Python in an isolated sandbox. You do NOT have direct access to data. The ONLY way to fetch data or perform computations is by writing Python code via execute_code that calls `call_tool()` inside the sandbox. `call_tool` is a built-in global inside the sandbox. No import is needed. +You can chain multiple call_tool(...) calls in the same code block, and you can also use regular Python code to post-process tool results, define variables, or control flow with conditionals and loops. {visibility_note} @@ -160,88 +194,33 @@ def _build_code_mode_instructions( Correct usage: result = call_tool("tool_name", keyword=value) -You can combine multiple call_tool(...) 
calls with regular Python code in the -same execute_code block, including loops, conditionals, variables, and -post-processing of tool results. - Wrong usage: call_tool("tool_name", {{"keyword": "value"}}) Do NOT hardcode data that should come from call_tool(...). Prefer one execute_code call per request when possible. -Always include the complete stdout from execute_code in your final answer. """ -def _create_wasm_sandbox(*, module_path: Path) -> Any: - """Create the provisional Hyperlight Wasm sandbox instance.""" - - try: - from hyperlight_sandbox import Sandbox - except ModuleNotFoundError as exc: - raise RuntimeError( - "This prototype expects an upstream `hyperlight_sandbox.Sandbox` " - "implementation. Install the provisional Hyperlight package once it " - "is available, or update this sample to match the final import path." - ) from exc - if not module_path.exists(): - raise RuntimeError( - "Hyperlight Wasm module not found.\n" - f" module: {module_path} (MISSING)\n" - "Build the provisional python-sandbox AOT module first, or set " - "HYPERLIGHT_MODULE to the correct path." - ) - - return Sandbox(backend="wasm", module_path=str(module_path)) +class CodeActContextProvider(BaseContextProvider): + """Inject a CodeAct surface using provider-owned tools. + Tools passed to the provider are registered with the sandbox and made + available to the model exclusively through ``execute_code``. They are + never added to the model-facing tool list — only ``execute_code`` is. 
-@tool(approval_mode="never_require") -def compute( - operation: Annotated[str, "Math operation: add, subtract, multiply, or divide."], - a: Annotated[float, "First numeric operand."], - b: Annotated[float, "Second numeric operand."], -) -> float: - """Perform a math operation used by sandbox code.""" - - operations = { - "add": a + b, - "subtract": a - b, - "multiply": a * b, - "divide": a / b if b else float("inf"), - } - return operations.get(operation, 0.0) + Per-run tools passed to ``agent.run(..., tools=...)`` are captured from + ``context.options["tools"]``, registered with the sandbox for the + duration of the run, and removed from the model-facing run options. + """ - -@tool(approval_mode="never_require") -def fetch_data( - table: Annotated[str, "Name of the simulated table to query."], -) -> list[dict[str, Any]]: - """Fetch simulated records from a named table.""" - - return { - "users": [ - {"id": 1, "name": "Alice", "role": "admin"}, - {"id": 2, "name": "Bob", "role": "user"}, - {"id": 3, "name": "Charlie", "role": "admin"}, - ], - "products": [ - {"id": 101, "name": "Widget", "price": 9.99}, - {"id": 102, "name": "Gadget", "price": 19.99}, - ], - }.get(table, []) - - -class CodeModeContextProvider(BaseContextProvider): - """Inject a code-mode surface using agent-configured tools.""" - - DEFAULT_SOURCE_ID = "code_mode_provider" + DEFAULT_SOURCE_ID = "codeact_provider" def __init__( self, source_id: str = DEFAULT_SOURCE_ID, *, tools: Sequence[FunctionTool] | None = None, - remove_tools_from_agent: bool = True, approval_mode: Literal["always_require", "never_require"] | None = None, ) -> None: """Initialize the provider. @@ -250,13 +229,10 @@ def __init__( source_id: Unique provider source identifier. Keyword Args: - tools: Additional sandbox-managed tools owned by the provider. + tools: Sandbox-managed tools owned by the provider. These are available through ``call_tool(...)`` inside ``execute_code`` and are never surfaced to the model as separate tools. 
- remove_tools_from_agent: When True, remove the - tools from the model-facing tool list after the provider - captures them, including tools passed at run time. approval_mode: Base approval mode for the provider-managed `execute_code` tool. The effective mode is upgraded to the strictest mode required by the managed tools for each run. @@ -265,9 +241,7 @@ def __init__( super().__init__(source_id) self._provider_tools = collect_tools(tools) - self._remove_tools_from_agent = remove_tools_from_agent self._approval_mode = approval_mode - self._agent_tools: list[FunctionTool] | None = None self._managed_tools: list[FunctionTool] = [] self._base_signature: tuple[tuple[str, int], ...] = () self._runtime_signature: tuple[tuple[str, int], ...] = () @@ -375,18 +349,18 @@ def _run_code(self, *, code: str) -> list[Content]: logger.debug("execute_code completed.") contents: list[Content] = [] if result.stdout: - contents.append(Content.from_text(result.stdout)) + contents.append(Content.from_text(result.stdout.strip())) if result.stderr: contents.append( Content.from_text( - f"stderr:\n{result.stderr}", + f"stderr:\n{result.stderr.strip()}", additional_properties={"stream": "stderr"}, ) ) return contents or [Content.from_text("Code executed successfully without output.")] logger.debug("execute_code failed.") - error_details = result.stderr or "Unknown sandbox error" + error_details = result.stderr.strip() if result.stderr else "Unknown sandbox error" return [ Content.from_text(f"Execution error:\n{error_details}"), Content.from_error(message="Execution error", error_details=error_details), @@ -400,21 +374,12 @@ async def before_run( context: SessionContext, state: dict[str, Any], ) -> None: # noqa: ARG002 - """Inject code-mode instructions and the execute_code tool before each run.""" - - if self._agent_tools is None and isinstance(agent, Agent): - self._agent_tools = collect_tools(agent.default_options.get("tools", [])) - - if self._remove_tools_from_agent: - 
agent.default_options["tools"] = [ - tool_obj - for tool_obj in agent.default_options.get("tools", []) - if getattr(tool_obj, "name", None) == "execute_code" - ] + """Inject CodeAct instructions and the execute_code tool before each run.""" - runtime_tools = collect_tools(context.options.get("tools")) + # Capture and remove per-run tools so they are only available in the sandbox. + runtime_tools = collect_tools(context.options.pop("tools", None)) self._initialize_sandbox( - base_tools=collect_tools(self._provider_tools, self._agent_tools or []), + base_tools=self._provider_tools, runtime_tools=runtime_tools, ) self._execute_code_tool.approval_mode = _resolve_execute_code_approval_mode( @@ -422,72 +387,171 @@ async def before_run( tools=self._managed_tools, ) - if self._remove_tools_from_agent: - context.options.pop("tools") - context.extend_instructions( self.source_id, - _build_code_mode_instructions( + _build_codeact_instructions( tools=self._managed_tools, - tools_visible_to_model=not self._remove_tools_from_agent, + tools_visible_to_model=False, ), ) context.extend_tools(self.source_id, [self._execute_code_tool]) +# 1. Define a logging function middleware to observe tool invocations. +@function_middleware +async def log_function_calls( + context: FunctionInvocationContext, + call_next: Callable[[], Awaitable[None]], +) -> None: + """Log every tool call with readable code output and timing.""" + import time + + func_name = context.function.name + args = context.arguments if isinstance(context.arguments, dict) else {} + + # For execute_code, print the generated code as a readable block. 
+ if func_name == "execute_code" and "code" in args: + print(f"\n{_YELLOW}{'─' * 60}") + print("▶ execute_code") + print(f"{'─' * 60}{_RESET}") + print(args["code"]) + print(f"{_YELLOW}{'─' * 60}{_RESET}") + else: + print(f"\n{_YELLOW}▶ {func_name}({', '.join(f'{k}={v!r}' for k, v in args.items())}){_RESET}") + + start = time.perf_counter() + await call_next() + elapsed = time.perf_counter() - start + + # Show the result concisely — full stdout for execute_code, repr for others. + result = context.result + if func_name == "execute_code" and isinstance(result, list): + for item in result: + text = getattr(item, "text", None) + if text: + print(f"{_GREEN}stdout:\n{text}{_RESET}") + else: + print(f"{_YELLOW}◀ {func_name} → {result!r}{_RESET}") + + print(f"{_DIM} ({elapsed:.4f}s){_RESET}") + + +@tool(approval_mode="never_require") +def compute( + operation: Annotated[Literal['add', 'subtract', 'multiply', 'divide'], "Math operation: add, subtract, multiply, or divide."], + a: Annotated[float, "First numeric operand."], + b: Annotated[float, "Second numeric operand."], +) -> float: + """Perform a math operation, use this function instead of raw code, because it is safer.""" + + logger.warning("compute called with operation=%r, a=%r, b=%r", operation, a, b) + + operations = { + "add": a + b, + "subtract": a - b, + "multiply": a * b, + "divide": a / b if b else float("inf"), + } + return operations.get(operation, 0.0) + + +@tool(approval_mode="never_require") +async def fetch_data( + table: Annotated[str, "Name of the simulated table to query."], +) -> list[dict[str, Any]]: + """Fetch records from a named table. 
+ + There are two tables, with the columns shown below: + - users: id, name, role + - products: id, name, price + """ + + logger.warning("fetch_data called with table=%r", table) + + await asyncio.sleep(0.5) # Simulate some latency + + return { + "users": [ + {"id": 1, "name": "Alice", "role": "admin"}, + {"id": 2, "name": "Bob", "role": "user"}, + {"id": 3, "name": "Charlie", "role": "admin"}, + ], + "products": [ + {"id": 101, "name": "Widget", "price": 9.99}, + {"id": 102, "name": "Gadget", "price": 19.99}, + ], + }.get(table, []) + + async def main() -> None: - """Run the provider-managed code-mode sample.""" + """Run the provider-managed CodeAct sample.""" + # Tools are passed to the provider (not the agent) so they are only + # available inside the sandbox via call_tool(...) and never appear as + # separate model-facing tools. agent = Agent( client=AzureOpenAIResponsesClient( project_endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], deployment_name=os.environ["AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME"], credential=AzureCliCredential(), ), - name="HyperlightCodeModeProviderAgent", + name="HyperlightCodeActProviderAgent", instructions="You are a helpful assistant.", - tools=[compute, fetch_data], - context_providers=[CodeModeContextProvider(approval_mode="never_require")], + context_providers=[ + CodeActContextProvider(tools=[compute, fetch_data], approval_mode="never_require"), + ], + middleware=[log_function_calls], ) - print("=" * 60) - print("ContextProvider sample") - print("=" * 60) + print(f"{_CYAN}{'=' * 60}") + print("CodeAct ContextProvider sample") + print(f"{'=' * 60}{_RESET}") query = ( - "Fetch all users, find admins, multiply 6*7, and print the users, admins, " - "and multiplication result. Use one execute_code call." + "Fetch all users, find admins, multiply 7*(3*2), and print the users, admins, " + "and multiplication result. Use the execute_code call, and try to do as much as possible inside the sandbox with call_tool(...) 
instead of in raw code outside." ) - print(f"User: {query}") + print(f"{_CYAN}User: {query}{_RESET}") result = await agent.run(query) - print(f"Agent: {result.text}") + print(f"{_CYAN}Agent: {result.text}{_RESET}") """ Sample output (shape only): -Sandbox initialized and snapshotted (...) ============================================================ -ContextProvider sample +CodeAct ContextProvider sample ============================================================ -remove_tools_from_agent=True -approval_mode=never_require -User: Fetch all users, find admins, multiply 6*7, and print the users, admins, -and multiplication result. Use one execute_code call. +User: Fetch all users, find admins, multiply 6*7, ... + +──────────────────────────────────────────────────────────── +▶ execute_code +──────────────────────────────────────────────────────────── +users = call_tool("fetch_data", table="users") +admins = [u for u in users if u["role"] == "admin"] +result = call_tool("compute", operation="multiply", a=6, b=7) +print("Users:", users) +print("Admins:", admins) +print("6 * 7 =", result) +──────────────────────────────────────────────────────────── +stdout: +Users: [...] +Admins: [...] +6 * 7 = 42.0 + (0.0452s) Agent: ... Notes: -- Pass tools to `CodeModeContextProvider(tools=[...])` to register sandbox-only - tools that are available through `call_tool(...)` but never exposed to the - model as separate tools. -- `remove_tools_from_agent` defaults to `True`, so the provider hides both - agent-configured and per-run tools from the model-facing tool list unless - you opt out. -- Set `approval_mode` on `CodeModeContextProvider(...)` to control the approval +- Tools are passed to `CodeActContextProvider(tools=[...])`, NOT to the agent. + This ensures they are only available inside the sandbox via `call_tool(...)`. + The model only sees the `execute_code` tool. 
+- The logging middleware prints the model-generated code as a readable block + and shows its stdout, so you can trace exactly what the agent does. +- Set `approval_mode` on `CodeActContextProvider(...)` to control the approval behavior of the provider-managed `execute_code` tool. - Pass tools to `agent.run(..., tools=runtime_tools)` to expose them as per-run - tools. The provider reads them from `context.options["tools"]`, registers - them with the sandbox, and clears them from the run options when removal is - enabled. + sandbox tools. The provider captures them from `context.options["tools"]`, + registers them with the sandbox, and removes them from the model-facing run + options. - This sample prioritizes the intended API shape over confirmed Hyperlight runtime integration. """ diff --git a/python/samples/02-agents/tools/code_mode_tool.py b/python/samples/02-agents/tools/codeact_tool.py similarity index 90% rename from python/samples/02-agents/tools/code_mode_tool.py rename to python/samples/02-agents/tools/codeact_tool.py index d3c06d699a..119f983142 100644 --- a/python/samples/02-agents/tools/code_mode_tool.py +++ b/python/samples/02-agents/tools/codeact_tool.py @@ -17,7 +17,7 @@ # Bootstrap manually with: # uv pip install --python 3.12 --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple \ # hyperlight-sandbox hyperlight-sandbox-backend-wasm hyperlight-sandbox-python-guest -# Run with: uv run --python 3.12 samples/02-agents/tools/code_mode_tool.py +# Run with: uv run --python 3.12 samples/02-agents/tools/codeact_tool.py # # Copyright (c) Microsoft. All rights reserved. @@ -50,7 +50,7 @@ logger = logging.getLogger(__name__) -"""This sample demonstrates a direct-tool Hyperlight code-mode prototype. +"""This sample demonstrates a direct-tool Hyperlight CodeAct prototype. The sample creates an `Agent(client=AzureOpenAIResponsesClient(...), ...)` with a primary `execute_code` tool plus schema-visible tools. 
It also supports @@ -86,8 +86,10 @@ def collect_tools(*tool_groups: Any) -> list[FunctionTool]: for tool_group in tool_groups: normalized_group: Sequence[Any] - if isinstance(tool_group, Sequence) and not isinstance(tool_group, (str, bytes, bytearray)) and all( - isinstance(tool_obj, FunctionTool) for tool_obj in tool_group + if ( + isinstance(tool_group, Sequence) + and not isinstance(tool_group, (str, bytes, bytearray)) + and all(isinstance(tool_obj, FunctionTool) for tool_obj in tool_group) ): normalized_group = tool_group else: @@ -107,23 +109,21 @@ def collect_tools(*tool_groups: Any) -> list[FunctionTool]: return tools -def build_code_mode_instructions( +def build_codeact_instructions( *, tools: Sequence[FunctionTool], tools_visible_to_model: bool, ) -> str: - """Build dynamic code-mode instructions for the discovered tools.""" + """Build dynamic CodeAct instructions for the discovered tools.""" if tools: - callback_lines = "\n\n".join( - [ - f"- `{tool_obj.name}`\n" - f" Description: {str(tool_obj.description or '').strip() or 'No description provided.'}\n" - " Parameters:\n" - f"{indent(json.dumps(tool_obj.parameters(), indent=2, sort_keys=True), ' ')}" - for tool_obj in tools - ] - ) + callback_lines = "\n\n".join([ + f"- `{tool_obj.name}`\n" + f" Description: {str(tool_obj.description or '').strip() or 'No description provided.'}\n" + " Parameters:\n" + f"{indent(json.dumps(tool_obj.parameters(), indent=2, sort_keys=True), ' ')}" + for tool_obj in tools + ]) else: callback_lines = "- No tools are currently registered inside the sandbox." 
@@ -171,7 +171,7 @@ def _create_wasm_sandbox(*, module_ref: str) -> Sandbox: return Sandbox(backend="wasm", module=module_ref) -class CodeModeSandboxManager: +class CodeActSandboxManager: """Manage the provisional Hyperlight sandbox lifecycle for this sample.""" def __init__(self, *, module_ref: str | None = None) -> None: @@ -275,10 +275,10 @@ def fetch_data( async def main() -> None: - """Run the direct-tool code-mode sample.""" + """Run the direct-tool CodeAct sample.""" runtime_tools: list[Any] = [] - sandbox_manager = CodeModeSandboxManager() + sandbox_manager = CodeActSandboxManager() @tool(name="execute_code", approval_mode="never_require") async def execute_code( @@ -300,7 +300,7 @@ async def execute_code( deployment_name=os.environ["AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME"], credential=AzureCliCredential(), ), - name="HyperlightCodeModeToolAgent", + name="HyperlightCodeActToolAgent", instructions="Temporary instructions replaced before the run.", tools=[execute_code, compute, fetch_data], ) @@ -308,13 +308,13 @@ async def execute_code( tools = collect_tools(agent.default_options.get("tools", []), runtime_tools) sandbox_manager.set_tools(tools) sandbox_manager.initialize() - agent.default_options["instructions"] = build_code_mode_instructions( + agent.default_options["instructions"] = build_codeact_instructions( tools=tools, tools_visible_to_model=True, ) print("=" * 60) - print("Direct tool sample") + print("CodeAct direct tool sample") print("=" * 60) print(f"runtime_tool_count={len(runtime_tools)}") print(f"User: {DEFAULT_PROMPT}") @@ -327,7 +327,7 @@ async def execute_code( Sandbox initialized and snapshotted (...) 
============================================================ -Direct tool sample +CodeAct direct tool sample ============================================================ runtime_tool_count=0 User: Fetch all users, find admins, multiply 6*7, and print the users, admins, From 37106d940c066b0b0e42d15da0738556668adcee Mon Sep 17 00:00:00 2001 From: Eduard van Valkenburg Date: Tue, 24 Mar 2026 16:43:45 +0100 Subject: [PATCH 04/17] udpated codeact --- .../tools/codeact_context_provider.py | 69 ++++++++++++++++--- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/python/samples/02-agents/tools/codeact_context_provider.py b/python/samples/02-agents/tools/codeact_context_provider.py index c2829e55ca..5d407d0fd6 100644 --- a/python/samples/02-agents/tools/codeact_context_provider.py +++ b/python/samples/02-agents/tools/codeact_context_provider.py @@ -96,6 +96,41 @@ def format(self, record: logging.LogRecord) -> str: """ +def _passthrough_result_parser(result: Any) -> str: + """Return a Python repr so sandbox code sees native-looking values. + + Using ``repr`` instead of ``json.dumps`` ensures the text can be + round-tripped back to a native Python value with ``ast.literal_eval``. + """ + return repr(result) + + +def _make_sandbox_callback(tool_obj: FunctionTool) -> Callable[..., Any]: + """Wrap a tool's ``invoke`` so ``call_tool`` returns native Python values. + + ``invoke()`` always returns ``list[Content]``. This wrapper extracts + the text, parses it back with ``ast.literal_eval``, and returns a + single value (not a list) when there is exactly one result item. 
+ """ + + async def _callback(**kwargs: Any) -> Any: + import ast + + contents = await tool_obj.invoke(**kwargs) + values: list[Any] = [] + for c in contents: + if c.text is not None: + try: + values.append(ast.literal_eval(c.text)) + except (ValueError, SyntaxError): + values.append(c.text) + if len(values) == 1: + return values[0] + return values + + return _callback + + def collect_tools(*tool_groups: Any) -> list[FunctionTool]: """Normalize and collect unique ``FunctionTool`` instances, excluding execute_code.""" @@ -184,21 +219,31 @@ def _build_codeact_instructions( sandbox. `call_tool` is a built-in global inside the sandbox. No import is needed. -You can chain multiple call_tool(...) calls in the same code block, and you can also use regular Python code to post-process tool results, define variables, or control flow with conditionals and loops. + +CRITICAL: call_tool takes the tool name as first argument, then KEYWORD +arguments only. Never pass a dict as a positional argument. {visibility_note} Available sandbox tools: {tools_descriptions} -Correct usage: -result = call_tool("tool_name", keyword=value) +Correct examples: + result = call_tool("tool_name", keyword=value) + data = call_tool("fetch_data", table="users") + x = call_tool("compute", operation="multiply", a=3, b=7) -Wrong usage: -call_tool("tool_name", {{"keyword": "value"}}) +WRONG — these will fail: + call_tool("tool_name", {{"keyword": "value"}}) # dict as positional arg + call_tool("tool_name", "value") # positional arg + +call_tool returns native Python values (int, float, str, list, dict), +so you can use results directly in subsequent code: + data = call_tool("fetch_data", table="users") + total = call_tool("compute", operation="add", a=data[0]["price"], b=data[1]["price"]) -Do NOT hardcode data that should come from call_tool(...). Prefer one execute_code call per request when possible. +Do NOT hardcode data that should come from call_tool(...). 
""" @@ -241,6 +286,8 @@ def __init__( super().__init__(source_id) self._provider_tools = collect_tools(tools) + for t in self._provider_tools: + t.result_parser = _passthrough_result_parser self._approval_mode = approval_mode self._managed_tools: list[FunctionTool] = [] self._base_signature: tuple[tuple[str, int], ...] = () @@ -282,7 +329,7 @@ def _build_sandbox_and_snapshot(*, tools: Sequence[FunctionTool], module_path: s sandbox = Sandbox(backend="wasm", module_path=module_path) for tool_obj in tools: - sandbox.register_tool(tool_obj.name, tool_obj.invoke) + sandbox.register_tool(tool_obj.name, _make_sandbox_callback(tool_obj)) sandbox.run("None") snapshot = sandbox.snapshot() @@ -378,6 +425,8 @@ async def before_run( # Capture and remove per-run tools so they are only available in the sandbox. runtime_tools = collect_tools(context.options.pop("tools", None)) + for t in runtime_tools: + t.result_parser = _passthrough_result_parser self._initialize_sandbox( base_tools=self._provider_tools, runtime_tools=runtime_tools, @@ -438,7 +487,9 @@ async def log_function_calls( @tool(approval_mode="never_require") def compute( - operation: Annotated[Literal['add', 'subtract', 'multiply', 'divide'], "Math operation: add, subtract, multiply, or divide."], + operation: Annotated[ + Literal["add", "subtract", "multiply", "divide"], "Math operation: add, subtract, multiply, or divide." 
+ ], a: Annotated[float, "First numeric operand."], b: Annotated[float, "Second numeric operand."], ) -> float: @@ -495,7 +546,7 @@ async def main() -> None: deployment_name=os.environ["AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME"], credential=AzureCliCredential(), ), - name="HyperlightCodeActProviderAgent", + name="CodeActProviderAgent", instructions="You are a helpful assistant.", context_providers=[ CodeActContextProvider(tools=[compute, fetch_data], approval_mode="never_require"), From f0ddf5dec63752a3de30897994ea71bcf59c07aa Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Tue, 7 Apr 2026 20:49:05 +0200 Subject: [PATCH 05/17] Draft CodeAct ADR and sample updates Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/decisions/0024-codeact-integration.md | 213 +++++++++++ .../code_act/dotnet-implementation.md | 6 + .../code_act/python-implementation.md | 340 ++++++++++++++++++ .../tools/codeact_context_provider.py | 23 +- .../samples/02-agents/tools/codeact_tool.py | 25 +- 5 files changed, 572 insertions(+), 35 deletions(-) create mode 100644 docs/decisions/0024-codeact-integration.md create mode 100644 docs/features/code_act/dotnet-implementation.md create mode 100644 docs/features/code_act/python-implementation.md diff --git a/docs/decisions/0024-codeact-integration.md b/docs/decisions/0024-codeact-integration.md new file mode 100644 index 0000000000..8b9f6e1bd3 --- /dev/null +++ b/docs/decisions/0024-codeact-integration.md @@ -0,0 +1,213 @@ +--- +status: proposed +contact: eavanvalkenburg +date: 2026-04-07 +deciders: TBD +consulted: +informed: +--- + +# CodeAct integration through backend-specific context providers and an `execute_code` tool + +## Context and Problem Statement + +We need an architecture design that supports CodeAct in both Python and .NET.
This is a necessary capability for the current generation of long-running agents, which need to plan, iterate, transform tool outputs, and execute bounded code inside a controlled runtime instead of pushing every intermediate step back through the model. The design should preserve the same behavioral contract across SDKs, but it does not need to use the same internal extension point in each runtime. We also want to standardize on Hyperlight as the initial backend, using the existing Python package and an anticipated .NET binding package once it is available. + +Throughout this ADR, **CodeAct** is the primary term. **Code mode** and **programmatic tool calling** refer to the same capability. This ADR uses **CodeAct** consistently. + +The core design question is: **where should CodeAct integrate into the agent pipeline so that both SDKs can offer the same functionality without invasive changes to their core function-calling loops?** + +## Decision Drivers + +- CodeAct must shape the model-facing surface before model invocation, not only after the model has already chosen tools. +- The design should let users control which tools are available through CodeAct and which remain regular tools only. +- The design must preserve existing session, approval, telemetry, and tool invocation behavior as much as possible. +- The design must fit naturally into the extension points that already exist in each SDK. +- The design must be safe for concurrent runs and must not rely on mutating shared agent configuration during invocation. +- The chosen structure should allow multiple backend-specific providers to fit under the same conceptual design over time, even though Hyperlight is the initial target. +- The abstraction should not assume that every backend is a VM-style sandbox; alternative execution models such as Pydantic's Monty should also fit.
+- The design should allow `execute_code` to be reused both as a tool-enabled CodeAct runtime and as a standard code interpreter tool implementation. +- The design should remain open to alternative language/runtime modes, such as JavaScript on Hyperlight, rather than hard-wiring the abstraction to Python only. +- The design should provide a portable way to configure sandbox capabilities such as file access and network access, including allow-listed outbound domains. +- Using CodeAct should be optional, and installing its runtime or backend dependencies should also be optional. +- Backend-specific dependencies should be isolated behind a small adapter so SDK code is not tightly coupled to an unstable package surface. + +## Considered Options + +- **Option 1**: Standardize on context provider-based CodeAct with a shared cross-SDK contract and backend-specific public types +- **Option 2**: Implement CodeAct as a dedicated chat-client decorator/wrapper +- **Option 3**: Integrate CodeAct directly into the function invocation layer/FunctionInvokingChatClient + +## Pros and Cons of the Options + +### Option 1: Standardize on context provider-based CodeAct with a shared cross-SDK contract and backend-specific public types + +This option uses `ContextProvider` in Python and `AIContextProvider` in .NET, but standardizes the public concept and behavior. +In this option, the CodeAct tool set is provider-owned: only tools explicitly configured on the concrete CodeAct provider instance are available inside CodeAct, and the provider exposes direct CRUD-style management for tools, file mounts, and outbound network allow-list configuration rather than requiring a separate runtime setup object. +The agent's direct tool surface remains separate. If a tool should be available both through CodeAct and as a normal direct tool, it is configured in both places. + +- Good, because both SDKs already have first-class provider concepts intended for per-invocation context shaping.
+- Good, because providers operate before model invocation, which is where CodeAct must add instructions and reshape tools. +- Good, because this lets us preserve existing function invocation behavior rather than rewriting it. +- Good, because slightly different internals are acceptable while the public behavior remains aligned. +- Good, because convenience builder/decorator helpers can still be added later on top of the provider model without changing the core design. +- Good, because backend-specific runtime logic can stay inside concrete provider implementations or internal helpers instead of being forced into a lowest-common-denominator public abstraction. +- Good, because the same provider structure can support either an all-or-nothing tool surface or a mixed side-by-side tool surface. +- Good, because users can keep some tools direct-only while allowing other tools to be used from inside CodeAct. +- Good, because a provider-owned CodeAct tool registry avoids mutating or inferring the agent's direct tool surface and can work consistently in both SDKs. +- Good, because the same conceptual design can remain open to `HyperlightCodeActContextProvider`, a future `MontyCodeActContextProvider`, and other backend-specific providers over time. +- Good, because `execute_code` can evolve into multiple backend-specific runtime modes rather than being hard-wired to one Python-plus-tools mode. +- Bad, because it is a bolt-on, which might make it less runtime efficient. + +### Option 2: Implement CodeAct as a dedicated chat-client decorator/wrapper + +This option would introduce a CodeAct-specific chat-client decorator that injects instructions and tools directly into the chat request pipeline. + +- Good, because this is a natural fit for .NET's `DelegatingChatClient` pipeline. +- Good, because it can also support advanced custom chat-client stacks. +- Good, because backend-specific runtime selection could be hidden inside the decorator implementation. 
+- Good, because the decorator could also encapsulate mode-specific instruction shaping for tool-enabled versus standalone interpreter behavior. +- Good, because the decorator can decide per request whether the tool surface is exclusive or mixed. +- Bad, because Python can support this by building a custom layering stack on top of a `Raw...Client` and swapping in a different `FunctionInvocationLayer`, but that composition path is more manual than the .NET `DelegatingChatClient` pipeline. +- Bad, because it duplicates responsibilities already handled by provider abstractions. +- Bad, because it makes CodeAct look more transport-specific than it really is. +- Bad, because swappable backends and reusable interpreter or language modes become coupled to chat-client composition rather than modeled as first-class CodeAct concepts. + +### Option 3: Integrate CodeAct directly into the function invocation layer/FunctionInvokingChatClient + +This option would push CodeAct into Python's `FunctionInvocationLayer` and .NET's `FunctionInvokingChatClient` or related middleware. + +- Good, because it is close to tool execution and can observe concrete tool invocation behavior. +- Good, because function middleware may still be useful later for auxiliary auditing or policy around sandbox-originated tool calls. +- Bad, because this is the wrong layer for constructing the model-facing tool surface and prompt instructions. +- Bad, because it does not naturally control whether the model sees an exclusive CodeAct tool surface or a mixed side-by-side tool surface. +- Bad, because it would still require a second mechanism for hiding normal tools and advertising `execute_code`. +- Bad, because it is a weak fit for standalone interpreter modes where no tool-calling loop is needed. +- Bad, because backend selection and CodeAct mode behavior are orthogonal concerns that do not belong in the function invocation layer. 
+- Bad, because `.NET` would become more tightly coupled to `FunctionInvokingChatClient`, which sits below the agent framework abstraction and is not the natural cross-SDK design seam. + +## Approval Model Options + +- **Option A**: Bundled approval for the `execute_code` invocation +- **Option B**: Pre-execution inspection of `call_tool(...)` references before approving `execute_code` +- **Option C**: Nested per-tool approvals during `execute_code` + +## Pros and Cons of the Approval Options + +### Option A: Bundled approval for the `execute_code` invocation + +This option grants approval once, before `execute_code` starts. Provider-owned tool calls made from inside that execution run under the same approval. The effective approval of `execute_code` is determined up front from the provider configuration rather than from inspecting which tools are actually called during execution. + +- Good, because it is the simplest model to explain and implement consistently in both SDKs. +- Good, because it fits naturally with long-running CodeAct loops where repeated approval interruptions would be disruptive. +- Good, because it does not require static code analysis before execution begins. +- Good, because it keeps the first release focused on the provider integration rather than a more complex approval engine. +- Bad, because approval is coarse-grained and may cover more activity than the user expected. +- Bad, because it provides less visibility into which provider-owned tools or capabilities will be exercised during the run. + +### Option B: Pre-execution inspection of `call_tool(...)` references before approving `execute_code` + +This option inspects submitted code for statically discoverable `call_tool("tool_name", ...)` references before execution starts and uses that information to shape the approval request. + +- Good, because it can show users more detail up front while still keeping approval at a single pre-execution moment. 
+- Good, because it matches the common case where tool names are spelled out directly in the generated code. +- Good, because it can coexist with bundled approval as a more informative variant of the same UX. +- Bad, because the analysis is inherently best-effort and cannot reliably predict dynamic behavior. +- Bad, because it requires duplicated parsing or inspection logic that does not replace runtime enforcement. + +### Option C: Nested per-tool approvals during `execute_code` + +This option requests approval when sandboxed code actually attempts to invoke a provider-owned tool that requires approval. + +- Good, because it aligns approval with real behavior rather than predicted behavior. +- Good, because it gives precise visibility into which provider-owned tools are being used. +- Good, because it can allow some tool calls while rejecting others within the same execution. +- Bad, because it interrupts long-running CodeAct flows and can degrade the user experience significantly. +- Bad, because it requires more complex runtime plumbing and approval UX in both SDKs. +- Bad, because repeated approval pauses may make CodeAct less useful for the exact long-running scenarios that motivate this feature. + +## Decision Outcomes + +### Decision 1: Integration seam and public structure + +Chosen option: **Option 1: Standardize on provider-based CodeAct with a shared cross-SDK contract and backend-specific public types**, because it is the only option that maps cleanly to both SDKs, lets us reshape instructions and tools before model invocation, and avoids invasive changes to the existing function invocation loops while still allowing multiple backend-specific providers and multiple runtime modes to fit under the same structure later. 
+ +### Decision 2: Initial approval model + +Chosen option: **Option A: Bundled approval for the `execute_code` invocation**, because it is the smallest approval model that fits both SDKs, works well for long-running CodeAct flows, and does not force us to standardize a more complex inspection or policy engine in the first release. + +This follows the spirit of the current Python tool approval flow, where `FunctionTool` uses `approval_mode="always_require" | "never_require"` and the auto-invocation loop escalates the whole batch when any called tool requires approval. + +### Design summary + +We standardize the **public concept** of CodeAct across SDKs while allowing each SDK to use the extension point that fits it best. + +- Python uses a `ContextProvider`. +- .NET uses an `AIContextProvider`. +- The term **CodeAct context provider** is used throughout this ADR as a design concept, not as a required public base type. Public SDK APIs should prefer concrete backend-specific types such as `HyperlightCodeActContextProvider` rather than a public abstract `CodeActContextProvider` or a public `CodeActExecutor` parameter. +- CodeAct support should ship as an optional package in each SDK rather than as part of the core package, so users who do not need CodeAct do not take on its installation and dependency footprint. +- There is no separate runtime setup object in the chosen design. Concrete providers manage their provider-owned CodeAct tool registry, file mounts, and outbound network allow-list configuration directly through CRUD-style methods on the provider itself. +- At a high level, CodeAct is exposed through backend-specific context providers that contribute an `execute_code` tool, own the CodeAct-specific tool registry, and carry backend capability configuration such as filesystem and network access. +- The initial approval model is bundled approval for `execute_code`, using the same `approval_mode="always_require" | "never_require"` vocabulary as regular tools. 
+- The CodeAct provider exposes a default `approval_mode` for `execute_code`. If the provider default is `never_require`, the effective approval for `execute_code` is derived from the provider-owned CodeAct tool registry captured for the run. +- If every provider-owned CodeAct tool in that registry has `approval_mode="never_require"`, `execute_code` is treated as `never_require`. If any provider-owned CodeAct tool in that registry has `approval_mode="always_require"`, `execute_code` is treated as `always_require`, even if the generated code may not end up calling that tool. +- Approval is granted before `execute_code` starts, and provider-owned tool calls made from inside that execution run under the same approval. +- Direct-only agent tools do not affect the approval of `execute_code`; only the provider-owned CodeAct tool registry participates in that calculation. +- Configuring filesystem and network capability state on the provider, including adding file mounts or outbound network allow-list entries, is itself the approval for those capabilities in the initial model. +- Each `execute_code` invocation must start from a clean execution state. Exact caching, snapshot, and environment-reuse strategies are implementation details defined in the language-specific specs. +- The provider-based structure preserves room for future pre-execution inspection and nested per-tool approvals if later experience shows they are needed. +- Concrete backend-specific providers may still use small SDK-local helpers or adapters internally, but that split is an implementation detail rather than a public API requirement. 
+ +Detailed language-specific implementation notes are specified in: + +- [Python implementation](../features/code_act/python-implementation.md) +- [.NET implementation](../features/code_act/dotnet-implementation.md) + +### Concrete provider implementation contract + +The design does not require a public abstract `CodeActContextProvider` base class, but it does require a stable implementation contract for concrete providers. + +- Concrete providers should expose a standard capability surface at construction time, with SDK-appropriate naming for: + - approval mode + - filesystem mode + - workspace root + - file mounts + - network mode + - allowed outbound domains + - allowed HTTP methods or an equivalent outbound policy surface +- Concrete providers should expose direct CRUD-style methods for managing the provider-owned CodeAct tool registry, file mounts, and outbound network allow-list configuration, rather than requiring callers to construct a separate runtime setup object. +- Concrete providers should implement their host SDK's provider lifecycle hooks to: + - build CodeAct instructions, + - add `execute_code`, + - snapshot the effective CodeAct tool registry and capability settings for the run, + - compute the effective approval requirement for `execute_code`, + - configure file access and network access for the backend, + - prepare or restore execution state, + - execute code, + - and translate backend output into framework-native content. +- Any internal abstract/helper surface shared by multiple concrete providers should standardize responsibilities for: + - instruction construction, + - file-access configuration, + - network-access configuration, + - environment preparation/restoration, + - code execution, + - and output-to-content conversion. +- Backend execution output should reuse existing framework-native content/message primitives rather than introducing backend-specific public result DTOs. 
+ +## More Information + +### Related artifacts + +- Python implementation: [`docs/features/code_act/python-implementation.md`](../features/code_act/python-implementation.md) +- .NET implementation: [`docs/features/code_act/dotnet-implementation.md`](../features/code_act/dotnet-implementation.md) +- Python provider/session APIs: [`python/packages/core/agent_framework/_sessions.py`](../../python/packages/core/agent_framework/_sessions.py) +- Python function invocation loop: [`python/packages/core/agent_framework/_tools.py`](../../python/packages/core/agent_framework/_tools.py) +- .NET context provider abstraction: [`dotnet/src/Microsoft.Agents.AI.Abstractions/AIContextProvider.cs`](../../dotnet/src/Microsoft.Agents.AI.Abstractions/AIContextProvider.cs) +- .NET agent integration for context providers: [`dotnet/src/Microsoft.Agents.AI/ChatClient/ChatClientAgent.cs`](../../dotnet/src/Microsoft.Agents.AI/ChatClient/ChatClientAgent.cs) +- Optional .NET chat-client provider decorator: [`dotnet/src/Microsoft.Agents.AI/AIContextProviderDecorators/AIContextProviderChatClient.cs`](../../dotnet/src/Microsoft.Agents.AI/AIContextProviderDecorators/AIContextProviderChatClient.cs) +- .NET function invocation middleware seam: [`dotnet/src/Microsoft.Agents.AI/FunctionInvocationDelegatingAgentBuilderExtensions.cs`](../../dotnet/src/Microsoft.Agents.AI/FunctionInvocationDelegatingAgentBuilderExtensions.cs) + +### Related decisions + +- [0015-agent-run-context](0015-agent-run-context.md) +- [0016-python-context-middleware](0016-python-context-middleware.md) diff --git a/docs/features/code_act/dotnet-implementation.md b/docs/features/code_act/dotnet-implementation.md new file mode 100644 index 0000000000..087e564d62 --- /dev/null +++ b/docs/features/code_act/dotnet-implementation.md @@ -0,0 +1,6 @@ +# CodeAct .NET implementation + +This document will describe the .NET realization of the CodeAct design in 
+[`docs/decisions/0024-codeact-integration.md`](../../decisions/0024-codeact-integration.md). + +Coming soon. diff --git a/docs/features/code_act/python-implementation.md b/docs/features/code_act/python-implementation.md new file mode 100644 index 0000000000..badd1ebac2 --- /dev/null +++ b/docs/features/code_act/python-implementation.md @@ -0,0 +1,340 @@ +# CodeAct Python implementation + +This document describes the Python realization of the CodeAct design in +[`docs/decisions/0024-codeact-integration.md`](../../decisions/0024-codeact-integration.md). + +This document is intentionally focused on the Python design and public API surface. +The initial public Python type described here is `HyperlightCodeActContextProvider`. Future Python backends, such as Monty, should follow the same conceptual model with their own concrete provider types rather than through a public abstract base class or a public executor parameter. + +## What is the goal of this feature? + +Goals: +- Python developers can enable CodeAct through a `ContextProvider`-based integration. +- Developers can configure a provider-owned CodeAct tool set that is separate from the agent's direct `tools=` surface. +- Developers can use the same `execute_code` concept for both tool-enabled CodeAct and a standard code interpreter tool implementation. +- Developers can swap execution backends over time, starting with Hyperlight while keeping room for alternatives such as Pydantic's Monty. +- Developers can configure execution capabilities such as file access, workspace mounts, and outbound network allow lists in a portable way. + +Success Metric: +- Python samples exist for both a tool-enabled CodeAct mode and a standard interpreter mode. + +Implementation-free outcome: +- A Python developer can attach a backend-specific CodeAct provider, choose which tools are available inside CodeAct, and configure execution capabilities without rewriting the function invocation loop. + +## What is the problem being solved? 
+ +- Today, the easiest way to prototype CodeAct is to infer or reshape the agent's direct tool surface, which is fragile and hard to reason about. +- In Python, runtime tools and agent-default tools are reachable through awkward internal-looking surfaces, making tool provenance unreliable as an API contract. +- There is no first-class Python design that simultaneously covers Hyperlight-backed CodeAct now, future backend-specific providers such as Monty, and both tool-enabled and interpreter modes. +- Sandbox capabilities such as file access and network access need a portable configuration model instead of ad hoc backend-specific wiring. +- Approval behavior needs to be explicit and configurable, especially when CodeAct and direct tool calling may both be available. + +## API Changes + +### CodeAct contract + +#### Terminology + +- **CodeAct** is the primary term. +- **Code mode**, **codemode**, and **programmatic tool calling** refer to the same concept in this document. +- `execute_code` is the model-facing tool name used by the initial Python providers in this spec. + +#### Provider-owned CodeAct tool registry + +A concrete Python CodeAct provider owns the set of tools available through `call_tool(...)` inside CodeAct. + +Rules: +- Only tools explicitly configured on the concrete provider instance are available inside CodeAct. +- The provider must not infer its CodeAct-managed tool set from the agent's direct `tools=` configuration. +- Exclusive versus mixed behavior is achieved by where tools are configured, not by rewriting the agent's direct tool list. + +Implications: +- **CodeAct-only tool**: configured on the concrete CodeAct provider only. +- **Direct-only tool**: configured on the agent only. +- **Tool available both ways**: configured on both the agent and the concrete CodeAct provider. + +#### Managing tools and capabilities after provider construction + +There is no separate runtime setup object in the Python design. 
CodeAct tools, file mounts, and outbound network allow-list state are managed directly on the provider through CRUD-style registry methods. + +Preferred pattern: +- `add_tools(...) -> None` +- `get_tools() -> Sequence[ToolTypes]` +- `remove_tools(...) -> None` +- `clear_tools() -> None` +- `add_file_mounts(...) -> None` +- `get_file_mounts() -> Sequence[FileMount]` +- `remove_file_mounts(...) -> None` +- `clear_file_mounts() -> None` +- `add_allowed_domains(...) -> None` +- `get_allowed_domains() -> Sequence[str]` +- `remove_allowed_domains(...) -> None` +- `clear_allowed_domains() -> None` +- `add_allowed_http_methods(...) -> None` +- `get_allowed_http_methods() -> Sequence[str]` +- `remove_allowed_http_methods(...) -> None` +- `clear_allowed_http_methods() -> None` + +Requirements: +- The provider-owned CodeAct tool registry is keyed by tool name. +- `add_tools(...)` adds new tools and replaces an existing provider-owned registration when the same tool name is added again. +- `get_tools()` returns the provider's current configured CodeAct tool registry. +- `remove_tools(...)` removes provider-owned CodeAct tools by name. +- `clear_tools()` removes all provider-owned CodeAct tools. +- File mounts are keyed by sandbox mount path. +- `add_file_mounts(...)` adds new file mounts and replaces an existing mount when the same mount path is added again. +- `get_file_mounts()` returns the provider's current configured file mounts. +- `remove_file_mounts(...)` removes file mounts by mount path. +- `clear_file_mounts()` removes all configured file mounts. +- Allowed domains are keyed by normalized domain string. +- `add_allowed_domains(...)` adds domains to the outbound allow list. +- `get_allowed_domains()` returns the current outbound domain allow list. +- `remove_allowed_domains(...)` removes domains from the outbound allow list. +- `clear_allowed_domains()` removes all configured allowed domains. +- Allowed HTTP methods are keyed by normalized method name. 
+- `add_allowed_http_methods(...)` adds methods to the outbound method allow list. +- `get_allowed_http_methods()` returns the current outbound method allow list. +- `remove_allowed_http_methods(...)` removes methods from the outbound method allow list. +- `clear_allowed_http_methods()` removes all configured allowed HTTP methods. +- Tool, file-mount, and network-allow-list mutations affect subsequent runs only; runs already in progress keep the snapshot captured at run start. +- The provider must snapshot its effective tool registry and capability state at the start of each run so concurrent execution remains deterministic. + +#### Approval model + +The initial Python design follows the ADR's initial approval decision and reuses the existing tool approval vocabulary from `agent_framework._tools`: + +- `approval_mode="always_require"` +- `approval_mode="never_require"` + +The provider exposes a default `approval_mode` for `execute_code`. + +Effective `execute_code` approval is computed as follows: + +- If the provider default is `always_require`, `execute_code` requires approval. +- If the provider default is `never_require`, the provider evaluates the provider-owned CodeAct tool registry snapshot for that run. +- If every provider-owned CodeAct tool in that snapshot is `never_require`, `execute_code` is `never_require`. +- If any provider-owned CodeAct tool in that snapshot is `always_require`, `execute_code` is `always_require`, even if the generated code may not call that tool. +- Provider-owned tool calls made through `call_tool(...)` during that execution run use the approval already determined for `execute_code`. +- Direct-only agent tools are excluded from this calculation. +- File and network capabilities do not create a separate runtime approval check in the initial model; configuring them on the provider, including adding file mounts or outbound network allow-list entries, is itself the approval for those capabilities. 
+ +This is intentionally conservative and matches the shape of the current function-tool approval flow, where `FunctionTool` uses `always_require` / `never_require` and the auto-invocation loop escalates the whole batch if any called tool requires approval. + +If the framework later standardizes pre-execution inspection or nested per-tool approvals, the Python provider surface can grow to expose that explicitly. The initial design does not assume that those extra modes are required. + +#### Shared execution flow + +On each run: +1. Resolve the provider's backend/runtime behavior, capabilities, provider default `approval_mode`, and provider-owned tool registry. +2. Compute the effective approval requirement for `execute_code` from the provider default plus the provider-owned tool registry snapshot. +3. Build provider-defined instructions. +4. Add `execute_code` to the model-facing tool surface. +5. Invoke the underlying model. +6. When `execute_code` is called, create or reuse an execution environment keyed by provider type, backend setup identity, capability configuration, and provider-owned tool signature. +7. If the current provider mode exposes host tools, expose `call_tool(...)` bound only to the provider-owned tool registry. +8. Execute code and convert results to framework-native content objects. + +Caching rules: +- Backends that support snapshots may cache a reusable clean snapshot. +- Backends that do not support snapshots may still cache warm initialization artifacts. +- No mutable per-run execution state may be shared across concurrent runs. 
+ +### Python public API + +#### Core types + +```python +@dataclass(frozen=True) +class FileMount: + host_path: Path + mount_path: str + mode: Literal["read_only", "read_write"] = "read_only" + + +class HyperlightCodeActContextProvider(ContextProvider): + def __init__( + self, + source_id: str = "codeact", + *, + module: str | Path, + tools: ToolTypes | None = None, + approval_mode: Literal["always_require", "never_require"] = "never_require", + filesystem_mode: Literal["none", "read_only", "read_write"] = "none", + workspace_root: Path | None = None, + file_mounts: Sequence[FileMount] = (), + network_mode: Literal["none", "allow_list"] = "none", + allowed_domains: Sequence[str] = (), + allowed_http_methods: Sequence[str] = (), + ) -> None: ... + + def add_tools(self, tools: ToolTypes | Sequence[ToolTypes]) -> None: ... + def get_tools(self) -> Sequence[ToolTypes]: ... + def remove_tools(self, tool_names: str | Sequence[str]) -> None: ... + def clear_tools(self) -> None: ... + def add_file_mounts(self, mounts: FileMount | Sequence[FileMount]) -> None: ... + def get_file_mounts(self) -> Sequence[FileMount]: ... + def remove_file_mounts(self, mount_paths: str | Sequence[str]) -> None: ... + def clear_file_mounts(self) -> None: ... + def add_allowed_domains(self, domains: str | Sequence[str]) -> None: ... + def get_allowed_domains(self) -> Sequence[str]: ... + def remove_allowed_domains(self, domains: str | Sequence[str]) -> None: ... + def clear_allowed_domains(self) -> None: ... + def add_allowed_http_methods(self, methods: str | Sequence[str]) -> None: ... + def get_allowed_http_methods(self) -> Sequence[str]: ... + def remove_allowed_http_methods(self, methods: str | Sequence[str]) -> None: ... + def clear_allowed_http_methods(self) -> None: ... +``` + +No public abstract `CodeActContextProvider` base or public `executor=` parameter is required for the initial Python API. 
+ +Provider modes: +- If no CodeAct-managed tools are configured, `HyperlightCodeActContextProvider` uses interpreter-style behavior. +- If one or more CodeAct-managed tools are configured, `HyperlightCodeActContextProvider` uses tool-enabled behavior. + +#### Python provider implementation contract + +The concrete provider plugs into the existing Python `ContextProvider` surface from `agent_framework._sessions`. + +Required lifecycle hooks: +- `before_run(*, agent, session, context, state) -> None` +- `after_run(*, agent, session, context, state) -> None` + +`before_run(...)` is responsible for: +- snapshotting the current CodeAct-managed tool registry and capability settings into `state`, +- computing the effective approval requirement for `execute_code` from the provider default and the snapshotted tool registry, +- building CodeAct instructions, +- adding `execute_code` to the run through `SessionContext.extend_tools(...)`, +- and wiring any backend-specific execution state needed for the run. + +`after_run(...)` is responsible for any backend-specific cleanup or post-processing that must happen after the model invocation completes. + +If shared internal helpers are introduced later for multiple concrete providers, they should standardize responsibilities for: +- building instructions, +- computing effective approval, +- configuring file access, +- configuring network access, +- preparing or restoring execution state, +- executing code, +- and converting backend output into framework-native `Content`. + +#### Runtime behavior + +- `before_run(...)` adds CodeAct instructions through `SessionContext.extend_instructions(...)`. +- `before_run(...)` adds `execute_code` through `SessionContext.extend_tools(...)`. +- `before_run(...)` builds Hyperlight-specific instructions from the current CodeAct tool registry and capability configuration. +- `execute_code` invokes the configured Hyperlight sandbox guest. 
+- If the current CodeAct tool registry is non-empty, the runtime injects `call_tool(...)` bound to the provider-owned tool registry. +- The provider does not inspect or mutate `Agent.default_options["tools"]` or `context.options["tools"]` to determine its CodeAct tool set. +- The provider snapshots the current CodeAct tool registry and capability state at run start, so later registry and allow-list mutations only affect future runs. +- Interpreter versus tool-enabled behavior is derived from the concrete provider and the presence of CodeAct-managed tools, not from a separate public profile object. + +#### Backend integration + +Initial public provider: +- `HyperlightCodeActContextProvider` + +Backend-specific notes: +- **Hyperlight** + - Provider construction needs a guest artifact via `module`, which may be a packaged guest module name or a path to a compiled guest artifact. + - File access maps naturally to Hyperlight Sandbox's read-only `/input` and writable `/output` capability model. + - Network access is denied by default and is enabled through allow-listed domains plus HTTP verbs. +- **Monty** + - A future `MontyCodeActContextProvider` should be a separate public type rather than a `HyperlightCodeActContextProvider` mode. + - Monty does not expose built-in filesystem or network access directly inside the interpreter. + - File and URL access are mediated through host-provided external functions, so a Monty provider would need to translate provider settings into virtual files and allow-checked callbacks. + - Monty setup may also include backend-specific inputs such as `script_name`, optional type-check stubs, or restored snapshots. 
+ +#### Capability handling + +Capabilities are first-class `HyperlightCodeActContextProvider` init parameters and, for collection-shaped state, provider-managed CRUD surfaces: +- `filesystem_mode` +- `workspace_root` +- `file_mounts` +- `network_mode` +- `allowed_domains` +- `allowed_http_methods` + +Concrete providers should normalize these settings internally. Hyperlight can map them directly to sandbox capabilities, while Monty must enforce them through host-mediated file and network functions and may apply stricter URL-level checks than the public provider surface expresses. + +Expected management split: +- scalar policy settings such as `filesystem_mode`, `workspace_root`, and `network_mode` remain direct configuration values on the provider, +- file mounts are managed through provider CRUD methods, +- outbound domains are managed through provider CRUD methods, +- outbound HTTP methods are managed through provider CRUD methods. + +Enabling access means: +- `filesystem_mode="none"` disables file access from sandboxed code. +- `filesystem_mode="read_only"` or `"read_write"` enables file access within the mounted/workspace surface exposed by the provider. +- `network_mode="none"` disables outbound network access. +- `network_mode="allow_list"` enables outbound access only for the configured `allowed_domains` and `allowed_http_methods`. + +Backends may implement stricter semantics than these top-level settings. For example, Hyperlight naturally maps file access to `/input` and `/output`, while Monty would enforce equivalent policy through host-provided callbacks rather than direct interpreter I/O. + +#### Execution output representation + +Backend execution output should be translated into existing AF `Content` values rather than a custom `CodeActExecutionResult` type. 
+ +Use the existing content model from `agent_framework._types`, for example: +- `Content.from_text(...)` for plain textual output, +- `Content.from_data(...)` or `Content.from_uri(...)` for generated files or binary artifacts, +- `Content.from_error(...)` for execution failures, +- `Content.from_shell_command_output(...)` when stdout/stderr/exit status need to stay structured, +- and `Content.from_function_result(..., result=list[Content])` when surfacing the final result of `execute_code` through the normal tool result path. + +#### `execute_code` input contract + +```json +{ + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Code to execute using the provider's configured backend/runtime behavior." + } + }, + "required": ["code"] +} +``` + +Execution failures should surface readable error text and structured error `Content`, not a custom backend result object. + +## E2E Code Samples + +### Tool-enabled CodeAct mode + +```python +codeact = HyperlightCodeActContextProvider( + module="python_guest.path", + tools=[fetch_docs, query_data], + filesystem_mode="read_write", + workspace_root="./workdir", + network_mode="allow_list", + allowed_domains=["api.github.com"], + allowed_http_methods=["GET"], +) +codeact.add_tools([lookup_user]) + +agent = Agent( + client=client, + name="assistant", + tools=[send_email], # direct-only tool + context_providers=[codeact], +) +``` + +### Standard code interpreter mode + +```python +code_interpreter = HyperlightCodeActContextProvider( + module="python_guest.path", + filesystem_mode="read_only", + workspace_root="./data", + network_mode="none", +) + +agent = Agent( + client=client, + name="interpreter", + context_providers=[code_interpreter], +) +``` diff --git a/python/samples/02-agents/tools/codeact_context_provider.py b/python/samples/02-agents/tools/codeact_context_provider.py index 5d407d0fd6..b12527afe7 100644 --- a/python/samples/02-agents/tools/codeact_context_provider.py +++ 
b/python/samples/02-agents/tools/codeact_context_provider.py @@ -5,18 +5,7 @@ # "hyperlight-sandbox-backend-wasm", # "hyperlight-sandbox-python-guest", # ] -# [tool.uv.sources] -# hyperlight-sandbox = { index = "testpypi" } -# hyperlight-sandbox-backend-wasm = { index = "testpypi" } -# hyperlight-sandbox-python-guest = { index = "testpypi" } -# [[tool.uv.index]] -# name = "testpypi" -# url = "https://test.pypi.org/simple/" -# explicit = true # /// -# Bootstrap manually with: -# uv pip install --python 3.12 --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple \ -# hyperlight-sandbox hyperlight-sandbox-backend-wasm hyperlight-sandbox-python-guest # Run with: uv run --python 3.12 samples/02-agents/tools/codeact_context_provider.py # # Copyright (c) Microsoft. All rights reserved. @@ -34,8 +23,8 @@ from agent_framework import ( Agent, AgentSession, - BaseContextProvider, Content, + ContextProvider, FunctionInvocationContext, FunctionTool, SessionContext, @@ -43,7 +32,7 @@ tool, ) from agent_framework._tools import normalize_tools -from agent_framework.azure import AzureOpenAIResponsesClient +from agent_framework.foundry import FoundryChatClient from azure.identity import AzureCliCredential from dotenv import load_dotenv @@ -247,7 +236,7 @@ def _build_codeact_instructions( """ -class CodeActContextProvider(BaseContextProvider): +class CodeActContextProvider(ContextProvider): """Inject a CodeAct surface using provider-owned tools. Tools passed to the provider are registered with the sandbox and made @@ -541,9 +530,9 @@ async def main() -> None: # available inside the sandbox via call_tool(...) and never appear as # separate model-facing tools. 
agent = Agent( - client=AzureOpenAIResponsesClient( - project_endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], - deployment_name=os.environ["AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME"], + client=FoundryChatClient( + project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + model=os.environ["FOUNDRY_MODEL"], credential=AzureCliCredential(), ), name="CodeActProviderAgent", diff --git a/python/samples/02-agents/tools/codeact_tool.py b/python/samples/02-agents/tools/codeact_tool.py index 119f983142..5f48a82f19 100644 --- a/python/samples/02-agents/tools/codeact_tool.py +++ b/python/samples/02-agents/tools/codeact_tool.py @@ -5,18 +5,7 @@ # "hyperlight-sandbox-backend-wasm", # "hyperlight-sandbox-python-guest", # ] -# [tool.uv.sources] -# hyperlight-sandbox = { index = "testpypi" } -# hyperlight-sandbox-backend-wasm = { index = "testpypi" } -# hyperlight-sandbox-python-guest = { index = "testpypi" } -# [[tool.uv.index]] -# name = "testpypi" -# url = "https://test.pypi.org/simple/" -# explicit = true # /// -# Bootstrap manually with: -# uv pip install --python 3.12 --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple \ -# hyperlight-sandbox hyperlight-sandbox-backend-wasm hyperlight-sandbox-python-guest # Run with: uv run --python 3.12 samples/02-agents/tools/codeact_tool.py # # Copyright (c) Microsoft. All rights reserved. @@ -33,7 +22,7 @@ from agent_framework import Agent, Content, FunctionTool, tool from agent_framework._tools import normalize_tools -from agent_framework.azure import AzureOpenAIResponsesClient +from agent_framework.foundry import FoundryChatClient from azure.identity import AzureCliCredential from dotenv import load_dotenv @@ -42,8 +31,8 @@ except ModuleNotFoundError as exc: raise RuntimeError( "This prototype expects an upstream `hyperlight_sandbox.Sandbox` " - "implementation. Install the provisional Hyperlight packages from TestPyPI, " - "or update this sample to match the final import path." + "implementation. 
Install the provisional Hyperlight package once it " + "is available, or update this sample to match the final import path." ) from exc load_dotenv() @@ -52,7 +41,7 @@ """This sample demonstrates a direct-tool Hyperlight CodeAct prototype. -The sample creates an `Agent(client=AzureOpenAIResponsesClient(...), ...)` with a +The sample creates an `Agent(client=FoundryChatClient(...), ...)` with a primary `execute_code` tool plus schema-visible tools. It also supports per-run runtime tools by registering them with the sandbox before the run and passing them through `agent.run(..., tools=runtime_tools)`. @@ -295,9 +284,9 @@ async def execute_code( return sandbox_manager.run_code(code=code) agent = Agent( - client=AzureOpenAIResponsesClient( - project_endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], - deployment_name=os.environ["AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME"], + client=FoundryChatClient( + project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + model=os.environ["FOUNDRY_MODEL"], credential=AzureCliCredential(), ), name="HyperlightCodeActToolAgent", From f8d959e10dd6ff564cfa84bd2a65b023c80567b9 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 9 Apr 2026 15:25:40 +0200 Subject: [PATCH 06/17] initial implementation and adr and feature --- .../workflows/python-integration-tests.yml | 5 +- .github/workflows/python-merge-tests.yml | 4 +- docs/decisions/0024-codeact-integration.md | 4 +- .../code_act/python-implementation.md | 61 +- python/.cspell.json | 2 + python/PACKAGE_STATUS.md | 1 + .../packages/core/agent_framework/_tools.py | 9 +- .../packages/core/tests/core/test_agents.py | 4 +- python/packages/hyperlight/LICENSE | 21 + python/packages/hyperlight/README.md | 26 + .../agent_framework_hyperlight/__init__.py | 23 + .../_execute_code_tool.py | 700 ++++++++++++++++++ .../_instructions.py | 145 ++++ .../agent_framework_hyperlight/_provider.py | 133 ++++ .../agent_framework_hyperlight/_types.py | 18 + python/packages/hyperlight/pyproject.toml | 102 +++ 
python/packages/hyperlight/samples/README.md | 16 + .../samples/codeact_context_provider.py | 192 +++++ .../hyperlight/samples/codeact_tool.py | 110 +++ .../hyperlight/test_hyperlight_codeact.py | 382 ++++++++++ python/pyproject.toml | 1 + .../tools/codeact_context_provider.py | 601 --------------- .../samples/02-agents/tools/codeact_tool.py | 335 --------- python/uv.lock | 53 ++ 24 files changed, 1975 insertions(+), 973 deletions(-) create mode 100644 python/packages/hyperlight/LICENSE create mode 100644 python/packages/hyperlight/README.md create mode 100644 python/packages/hyperlight/agent_framework_hyperlight/__init__.py create mode 100644 python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py create mode 100644 python/packages/hyperlight/agent_framework_hyperlight/_instructions.py create mode 100644 python/packages/hyperlight/agent_framework_hyperlight/_provider.py create mode 100644 python/packages/hyperlight/agent_framework_hyperlight/_types.py create mode 100644 python/packages/hyperlight/pyproject.toml create mode 100644 python/packages/hyperlight/samples/README.md create mode 100644 python/packages/hyperlight/samples/codeact_context_provider.py create mode 100644 python/packages/hyperlight/samples/codeact_tool.py create mode 100644 python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py delete mode 100644 python/samples/02-agents/tools/codeact_context_provider.py delete mode 100644 python/samples/02-agents/tools/codeact_tool.py diff --git a/.github/workflows/python-integration-tests.yml b/.github/workflows/python-integration-tests.yml index 523a763b62..0931be2c51 100644 --- a/.github/workflows/python-integration-tests.yml +++ b/.github/workflows/python-integration-tests.yml @@ -131,7 +131,7 @@ jobs: --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 - # Misc integration tests (Anthropic, Ollama, MCP) + # Misc integration tests (Anthropic, Hyperlight, Ollama, MCP) 
python-tests-misc-integration: name: Python Integration Tests - Misc runs-on: ubuntu-latest @@ -162,10 +162,11 @@ jobs: fallback_url: ${{ env.LOCAL_MCP_URL }} - name: Prefer local MCP URL when available run: echo "LOCAL_MCP_URL=${{ steps.local-mcp.outputs.effective_url }}" >> "$GITHUB_ENV" - - name: Test with pytest (Anthropic, Ollama, MCP integration) + - name: Test with pytest (Anthropic, Hyperlight, Ollama, MCP integration) run: > uv run pytest --import-mode=importlib packages/anthropic/tests + packages/hyperlight/tests packages/ollama/tests packages/core/tests/core/test_mcp.py -m integration diff --git a/.github/workflows/python-merge-tests.yml b/.github/workflows/python-merge-tests.yml index 4fc47af595..ccc2966c7e 100644 --- a/.github/workflows/python-merge-tests.yml +++ b/.github/workflows/python-merge-tests.yml @@ -65,6 +65,7 @@ jobs: - 'python/samples/**/providers/azure/**' misc: - 'python/packages/anthropic/**' + - 'python/packages/hyperlight/**' - 'python/packages/ollama/**' - 'python/packages/core/agent_framework/_mcp.py' - 'python/packages/core/tests/core/test_mcp.py' @@ -278,10 +279,11 @@ jobs: fallback_url: ${{ env.LOCAL_MCP_URL }} - name: Prefer local MCP URL when available run: echo "LOCAL_MCP_URL=${{ steps.local-mcp.outputs.effective_url }}" >> "$GITHUB_ENV" - - name: Test with pytest (Anthropic, Ollama, MCP integration) + - name: Test with pytest (Anthropic, Hyperlight, Ollama, MCP integration) run: > uv run pytest --import-mode=importlib packages/anthropic/tests + packages/hyperlight/tests packages/ollama/tests packages/core/tests/core/test_mcp.py -m integration diff --git a/docs/decisions/0024-codeact-integration.md b/docs/decisions/0024-codeact-integration.md index 8b9f6e1bd3..309b092db3 100644 --- a/docs/decisions/0024-codeact-integration.md +++ b/docs/decisions/0024-codeact-integration.md @@ -55,7 +55,7 @@ The agent's direct tool surface remains separate. 
If a tool should be available - Good, because the same provider structure can support either an all-or-nothing tool surface or a mixed side-by-side tool surface. - Good, because users can keep some tools direct-only while allowing other tools to be used from inside CodeAct. - Good, because a provider-owned CodeAct tool registry avoids mutating or inferring the agent's direct tool surface and can work consistently in both SDKs. -- Good, because the same conceptual design can remain open to `HyperlightCodeActContextProvider`, a future `MontyCodeActContextProvider`, and other backend-specific providers over time. +- Good, because the same conceptual design can remain open to `HyperlightCodeActProvider`, a future `MontyCodeActProvider`, and other backend-specific providers over time. - Good, because `execute_code` can evolve into multiple backend-specific runtime modes rather than being hard-wired to one Python-plus-tools mode. - Bad, because it is a bolt-on, which might make it less runtime efficient. @@ -144,7 +144,7 @@ We standardize the **public concept** of CodeAct across SDKs while allowing each - Python uses a `ContextProvider`. - .NET uses an `AIContextProvider`. -- The term **CodeAct context provider** is used throughout this ADR as a design concept, not as a required public base type. Public SDK APIs should prefer concrete backend-specific types such as `HyperlightCodeActContextProvider` rather than a public abstract `CodeActContextProvider` or a public `CodeActExecutor` parameter. +- The term **CodeAct context provider** is used throughout this ADR as a design concept, not as a required public base type. Public SDK APIs should prefer concrete backend-specific types such as `HyperlightCodeActProvider` rather than a public abstract `CodeActContextProvider` or a public `CodeActExecutor` parameter. 
- CodeAct support should ship as an optional package in each SDK rather than as part of the core package, so users who do not need CodeAct do not take on its installation and dependency footprint. - There is no separate runtime setup object in the chosen design. Concrete providers manage their provider-owned CodeAct tool registry, file mounts, and outbound network allow-list configuration directly through CRUD-style methods on the provider itself. - At a high level, CodeAct is exposed through backend-specific context providers that contribute an `execute_code` tool, own the CodeAct-specific tool registry, and carry backend capability configuration such as filesystem and network access. diff --git a/docs/features/code_act/python-implementation.md b/docs/features/code_act/python-implementation.md index badd1ebac2..195d31a1b8 100644 --- a/docs/features/code_act/python-implementation.md +++ b/docs/features/code_act/python-implementation.md @@ -4,7 +4,7 @@ This document describes the Python realization of the CodeAct design in [`docs/decisions/0024-codeact-integration.md`](../../decisions/0024-codeact-integration.md). This document is intentionally focused on the Python design and public API surface. -The initial public Python type described here is `HyperlightCodeActContextProvider`. Future Python backends, such as Monty, should follow the same conceptual model with their own concrete provider types rather than through a public abstract base class or a public executor parameter. +The initial public Python type described here is `HyperlightCodeActProvider`. Future Python backends, such as Monty, should follow the same conceptual model with their own concrete provider types rather than through a public abstract base class or a public executor parameter. ## What is the goal of this feature? @@ -24,7 +24,7 @@ Implementation-free outcome: ## What is the problem being solved? 
- Today, the easiest way to prototype CodeAct is to infer or reshape the agent's direct tool surface, which is fragile and hard to reason about. -- In Python, runtime tools and agent-default tools are reachable through awkward internal-looking surfaces, making tool provenance unreliable as an API contract. +- In Python, inferring a CodeAct tool surface from generic agent tool configuration is fragile and hard to reason about. - There is no first-class Python design that simultaneously covers Hyperlight-backed CodeAct now, future backend-specific providers such as Monty, and both tool-enabled and interpreter modes. - Sandbox capabilities such as file access and network access need a portable configuration model instead of ad hoc backend-specific wiring. - Approval behavior needs to be explicit and configurable, especially when CodeAct and direct tool calling may both be available. @@ -146,17 +146,18 @@ Caching rules: ```python @dataclass(frozen=True) class FileMount: - host_path: Path + host_path: str | Path mount_path: str - mode: Literal["read_only", "read_write"] = "read_only" -class HyperlightCodeActContextProvider(ContextProvider): +class HyperlightCodeActProvider(ContextProvider): def __init__( self, - source_id: str = "codeact", + source_id: str = "hyperlight_codeact", *, - module: str | Path, + backend: str = "wasm", + module: str | None = "python_guest.path", + module_path: str | None = None, tools: ToolTypes | None = None, approval_mode: Literal["always_require", "never_require"] = "never_require", filesystem_mode: Literal["none", "read_only", "read_write"] = "none", @@ -169,43 +170,53 @@ class HyperlightCodeActContextProvider(ContextProvider): def add_tools(self, tools: ToolTypes | Sequence[ToolTypes]) -> None: ... def get_tools(self) -> Sequence[ToolTypes]: ... - def remove_tools(self, tool_names: str | Sequence[str]) -> None: ... + def remove_tool(self, name: str) -> None: ... def clear_tools(self) -> None: ... 
def add_file_mounts(self, mounts: FileMount | Sequence[FileMount]) -> None: ... def get_file_mounts(self) -> Sequence[FileMount]: ... - def remove_file_mounts(self, mount_paths: str | Sequence[str]) -> None: ... + def remove_file_mount(self, mount_path: str) -> None: ... def clear_file_mounts(self) -> None: ... def add_allowed_domains(self, domains: str | Sequence[str]) -> None: ... def get_allowed_domains(self) -> Sequence[str]: ... - def remove_allowed_domains(self, domains: str | Sequence[str]) -> None: ... + def remove_allowed_domain(self, domain: str) -> None: ... def clear_allowed_domains(self) -> None: ... def add_allowed_http_methods(self, methods: str | Sequence[str]) -> None: ... def get_allowed_http_methods(self) -> Sequence[str]: ... - def remove_allowed_http_methods(self, methods: str | Sequence[str]) -> None: ... + def remove_allowed_http_method(self, method: str) -> None: ... def clear_allowed_http_methods(self) -> None: ... ``` No public abstract `CodeActContextProvider` base or public `executor=` parameter is required for the initial Python API. +The initial alpha package also exports a standalone `HyperlightExecuteCodeTool` +for direct-tool scenarios where a provider is not needed. That standalone tool +should advertise `call_tool(...)`, the registered sandbox tools, and capability +state through its own `description` rather than requiring separate agent +instructions. + Provider modes: -- If no CodeAct-managed tools are configured, `HyperlightCodeActContextProvider` uses interpreter-style behavior. -- If one or more CodeAct-managed tools are configured, `HyperlightCodeActContextProvider` uses tool-enabled behavior. +- If no CodeAct-managed tools are configured, `HyperlightCodeActProvider` uses interpreter-style behavior. +- If one or more CodeAct-managed tools are configured, `HyperlightCodeActProvider` uses tool-enabled behavior. 
#### Python provider implementation contract The concrete provider plugs into the existing Python `ContextProvider` surface from `agent_framework._sessions`. -Required lifecycle hooks: +Required lifecycle hook: - `before_run(*, agent, session, context, state) -> None` + +Optional lifecycle hook: - `after_run(*, agent, session, context, state) -> None` `before_run(...)` is responsible for: -- snapshotting the current CodeAct-managed tool registry and capability settings into `state`, +- snapshotting the current CodeAct-managed tool registry and capability settings for the run, - computing the effective approval requirement for `execute_code` from the provider default and the snapshotted tool registry, -- building CodeAct instructions, +- adding a short CodeAct guidance block, - adding `execute_code` to the run through `SessionContext.extend_tools(...)`, - and wiring any backend-specific execution state needed for the run. +If the provider stores anything in `state`, that value must stay JSON-serializable. + `after_run(...)` is responsible for any backend-specific cleanup or post-processing that must happen after the model invocation completes. If shared internal helpers are introduced later for multiple concrete providers, they should standardize responsibilities for: @@ -219,9 +230,9 @@ If shared internal helpers are introduced later for multiple concrete providers, #### Runtime behavior -- `before_run(...)` adds CodeAct instructions through `SessionContext.extend_instructions(...)`. +- `before_run(...)` adds a short CodeAct guidance block through `SessionContext.extend_instructions(...)`. - `before_run(...)` adds `execute_code` through `SessionContext.extend_tools(...)`. -- `before_run(...)` builds Hyperlight-specific instructions from the current CodeAct tool registry and capability configuration. +- The detailed `call_tool(...)`, sandbox-tool, and capability guidance is carried by `execute_code.description`. 
- `execute_code` invokes the configured Hyperlight sandbox guest. - If the current CodeAct tool registry is non-empty, the runtime injects `call_tool(...)` bound to the provider-owned tool registry. - The provider does not inspect or mutate `Agent.default_options["tools"]` or `context.options["tools"]` to determine its CodeAct tool set. @@ -231,7 +242,7 @@ If shared internal helpers are introduced later for multiple concrete providers, #### Backend integration Initial public provider: -- `HyperlightCodeActContextProvider` +- `HyperlightCodeActProvider` Backend-specific notes: - **Hyperlight** @@ -239,14 +250,14 @@ Backend-specific notes: - File access maps naturally to Hyperlight Sandbox's read-only `/input` and writable `/output` capability model. - Network access is denied by default and is enabled through allow-listed domains plus HTTP verbs. - **Monty** - - A future `MontyCodeActContextProvider` should be a separate public type rather than a `HyperlightCodeActContextProvider` mode. + - A future `MontyCodeActProvider` should be a separate public type rather than a `HyperlightCodeActProvider` mode. - Monty does not expose built-in filesystem or network access directly inside the interpreter. - File and URL access are mediated through host-provided external functions, so a Monty provider would need to translate provider settings into virtual files and allow-checked callbacks. - Monty setup may also include backend-specific inputs such as `script_name`, optional type-check stubs, or restored snapshots. #### Capability handling -Capabilities are first-class `HyperlightCodeActContextProvider` init parameters and, for collection-shaped state, provider-managed CRUD surfaces: +Capabilities are first-class `HyperlightCodeActProvider` init parameters and, for collection-shaped state, provider-managed CRUD surfaces: - `filesystem_mode` - `workspace_root` - `file_mounts` @@ -275,10 +286,10 @@ Backends may implement stricter semantics than these top-level settings. 
For exa Backend execution output should be translated into existing AF `Content` values rather than a custom `CodeActExecutionResult` type. Use the existing content model from `agent_framework._types`, for example: +- `Content.from_code_interpreter_tool_result(outputs=[...])` to surface the overall result of sandboxed code execution, - `Content.from_text(...)` for plain textual output, - `Content.from_data(...)` or `Content.from_uri(...)` for generated files or binary artifacts, - `Content.from_error(...)` for execution failures, -- `Content.from_shell_command_output(...)` when stdout/stderr/exit status need to stay structured, - and `Content.from_function_result(..., result=list[Content])` when surfacing the final result of `execute_code` through the normal tool result path. #### `execute_code` input contract @@ -303,8 +314,7 @@ Execution failures should surface readable error text and structured error `Cont ### Tool-enabled CodeAct mode ```python -codeact = HyperlightCodeActContextProvider( - module="python_guest.path", +codeact = HyperlightCodeActProvider( tools=[fetch_docs, query_data], filesystem_mode="read_write", workspace_root="./workdir", @@ -325,8 +335,7 @@ agent = Agent( ### Standard code interpreter mode ```python -code_interpreter = HyperlightCodeActContextProvider( - module="python_guest.path", +code_interpreter = HyperlightCodeActProvider( filesystem_mode="read_only", workspace_root="./data", network_mode="none", diff --git a/python/.cspell.json b/python/.cspell.json index a26cc7fed7..b72fa96cf5 100644 --- a/python/.cspell.json +++ b/python/.cspell.json @@ -30,6 +30,7 @@ "azuredocs", "azurefunctions", "boto", + "codeact", "contentvector", "contoso", "datamodel", @@ -45,6 +46,7 @@ "hnsw", "httpx", "huggingface", + "hyperlight", "Instrumentor", "logit", "logprobs", diff --git a/python/PACKAGE_STATUS.md b/python/PACKAGE_STATUS.md index 7a726812ff..7681ae1d0d 100644 --- a/python/PACKAGE_STATUS.md +++ b/python/PACKAGE_STATUS.md @@ -32,6 +32,7 @@ Status is 
grouped into these buckets: | `agent-framework-foundry` | `python/packages/foundry` | `released` | | `agent-framework-foundry-local` | `python/packages/foundry_local` | `beta` | | `agent-framework-github-copilot` | `python/packages/github_copilot` | `beta` | +| `agent-framework-hyperlight` | `python/packages/hyperlight` | `alpha` | | `agent-framework-lab` | `python/packages/lab` | `beta` | | `agent-framework-mem0` | `python/packages/mem0` | `beta` | | `agent-framework-ollama` | `python/packages/ollama` | `beta` | diff --git a/python/packages/core/agent_framework/_tools.py b/python/packages/core/agent_framework/_tools.py index 6cdc74b313..3d119413d0 100644 --- a/python/packages/core/agent_framework/_tools.py +++ b/python/packages/core/agent_framework/_tools.py @@ -89,6 +89,7 @@ DEFAULT_MAX_ITERATIONS: Final[int] = 40 DEFAULT_MAX_CONSECUTIVE_ERRORS_PER_REQUEST: Final[int] = 3 SHELL_TOOL_KIND_VALUE: Final[str] = "shell" +ApprovalMode: TypeAlias = Literal["always_require", "never_require"] ChatClientT = TypeVar("ChatClientT", bound="SupportsChatGetResponse[Any]") ResponseModelBoundT = TypeVar("ResponseModelBoundT", bound=BaseModel) @@ -270,7 +271,7 @@ def __init__( *, name: str, description: str = "", - approval_mode: Literal["always_require", "never_require"] | None = None, + approval_mode: ApprovalMode | None = None, kind: str | None = None, max_invocations: int | None = None, max_invocation_exceptions: int | None = None, @@ -1030,7 +1031,7 @@ def tool( name: str | None = None, description: str | None = None, schema: type[BaseModel] | Mapping[str, Any] | None = None, - approval_mode: Literal["always_require", "never_require"] | None = None, + approval_mode: ApprovalMode | None = None, kind: str | None = None, max_invocations: int | None = None, max_invocation_exceptions: int | None = None, @@ -1046,7 +1047,7 @@ def tool( name: str | None = None, description: str | None = None, schema: type[BaseModel] | Mapping[str, Any] | None = None, - approval_mode: 
Literal["always_require", "never_require"] | None = None, + approval_mode: ApprovalMode | None = None, kind: str | None = None, max_invocations: int | None = None, max_invocation_exceptions: int | None = None, @@ -1061,7 +1062,7 @@ def tool( name: str | None = None, description: str | None = None, schema: type[BaseModel] | Mapping[str, Any] | None = None, - approval_mode: Literal["always_require", "never_require"] | None = None, + approval_mode: ApprovalMode | None = None, kind: str | None = None, max_invocations: int | None = None, max_invocation_exceptions: int | None = None, diff --git a/python/packages/core/tests/core/test_agents.py b/python/packages/core/tests/core/test_agents.py index ff57f960ce..1358a997a5 100644 --- a/python/packages/core/tests/core/test_agents.py +++ b/python/packages/core/tests/core/test_agents.py @@ -846,7 +846,7 @@ async def test_context_provider_can_inspect_runtime_tools_from_run( ) -> None: seen_tools: list[Any] = [] - class RuntimeToolsProvider(BaseContextProvider): + class RuntimeToolsProvider(ContextProvider): def __init__(self) -> None: super().__init__(source_id="runtime-tools") @@ -876,7 +876,7 @@ async def before_run(self, *, agent: Any, session: Any, context: Any, state: Any async def test_context_provider_can_remove_runtime_tools_from_run( chat_client_base: SupportsChatGetResponse, ) -> None: - class RuntimeToolsProvider(BaseContextProvider): + class RuntimeToolsProvider(ContextProvider): def __init__(self) -> None: super().__init__(source_id="runtime-tools") diff --git a/python/packages/hyperlight/LICENSE b/python/packages/hyperlight/LICENSE new file mode 100644 index 0000000000..9e841e7a26 --- /dev/null +++ b/python/packages/hyperlight/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/python/packages/hyperlight/README.md b/python/packages/hyperlight/README.md new file mode 100644 index 0000000000..d4181808be --- /dev/null +++ b/python/packages/hyperlight/README.md @@ -0,0 +1,26 @@ +# agent-framework-hyperlight + +Alpha Hyperlight-backed CodeAct integrations for Microsoft Agent Framework. + +## Installation + +```bash +pip install agent-framework-hyperlight --pre +``` + +This package depends on `hyperlight-sandbox`, the packaged Python guest, and the +Wasm backend package on supported platforms. If the backend is not published for +your current platform yet, `execute_code` will fail at runtime when it tries to +create the sandbox. + +## Public API + +- `HyperlightCodeActProvider` +- `HyperlightExecuteCodeTool` +- `FileMount` + +## Notes + +- This package is intentionally separate from `agent-framework-core` so CodeAct + usage and installation remain optional. 
+- Alpha-package samples live under `packages/hyperlight/samples/`. diff --git a/python/packages/hyperlight/agent_framework_hyperlight/__init__.py b/python/packages/hyperlight/agent_framework_hyperlight/__init__.py new file mode 100644 index 0000000000..8bdc5a1467 --- /dev/null +++ b/python/packages/hyperlight/agent_framework_hyperlight/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Microsoft. All rights reserved. + +from __future__ import annotations + +import importlib.metadata + +from ._execute_code_tool import HyperlightExecuteCodeTool +from ._provider import HyperlightCodeActProvider +from ._types import FileMount, FilesystemMode, NetworkMode + +try: + __version__ = importlib.metadata.version(__name__) +except importlib.metadata.PackageNotFoundError: + __version__ = "0.0.0" + +__all__ = [ + "FileMount", + "FilesystemMode", + "HyperlightCodeActProvider", + "HyperlightExecuteCodeTool", + "NetworkMode", + "__version__", +] diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py new file mode 100644 index 0000000000..0ed9e695c7 --- /dev/null +++ b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py @@ -0,0 +1,700 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +from __future__ import annotations + +import ast +import copy +import mimetypes +import shutil +import threading +from collections.abc import Callable, Sequence +from dataclasses import dataclass +from pathlib import Path, PurePosixPath +from tempfile import TemporaryDirectory +from typing import Annotated, Any, Protocol +from urllib.parse import urlparse + +from agent_framework import Content, FunctionTool +from agent_framework._tools import ApprovalMode, normalize_tools +from pydantic import BaseModel, Field + +from ._instructions import build_codeact_instructions, build_execute_code_description +from ._types import FileMount, FilesystemMode, NetworkMode + +DEFAULT_HYPERLIGHT_BACKEND = "wasm" +DEFAULT_HYPERLIGHT_MODULE = "python_guest.path" +EXECUTE_CODE_INPUT_DESCRIPTION = "Python code to execute in an isolated Hyperlight sandbox." + + +class _ExecuteCodeInput(BaseModel): + code: Annotated[str, Field(description=EXECUTE_CODE_INPUT_DESCRIPTION)] + + +@dataclass(frozen=True, slots=True) +class _StoredFileMount: + host_path: Path + mount_path: str + + +@dataclass(frozen=True, slots=True) +class _NormalizedFileMount: + host_path: Path + mount_path: str + path_signature: tuple[tuple[str, int, int], ...] + + +@dataclass(frozen=True, slots=True) +class _RunConfig: + backend: str + module: str | None + module_path: str | None + approval_mode: ApprovalMode + tools: tuple[FunctionTool, ...] + filesystem_mode: FilesystemMode + workspace_root: Path | None + workspace_signature: tuple[tuple[str, int, int], ...] + file_mounts: tuple[_NormalizedFileMount, ...] + network_mode: NetworkMode + allowed_domains: tuple[str, ...] + allowed_http_methods: tuple[str, ...] 
+ + @property + def mounted_paths(self) -> tuple[str, ...]: + return tuple(_display_mount_path(mount.mount_path) for mount in self.file_mounts) + + def cache_key(self) -> tuple[Any, ...]: + return ( + self.backend, + self.module, + self.module_path, + self.approval_mode, + tuple((tool_obj.name, id(tool_obj)) for tool_obj in self.tools), + self.filesystem_mode, + str(self.workspace_root) if self.workspace_root is not None else None, + self.workspace_signature, + tuple((mount.mount_path, str(mount.host_path), mount.path_signature) for mount in self.file_mounts), + self.network_mode, + self.allowed_domains, + self.allowed_http_methods, + ) + + +class SandboxRuntime(Protocol): + def execute(self, *, config: _RunConfig, code: str) -> list[Content]: ... + + +@dataclass +class _SandboxEntry: + sandbox: Any + snapshot: Any + input_dir: TemporaryDirectory[str] | None + output_dir: TemporaryDirectory[str] | None + lock: threading.RLock + + +def _load_sandbox_class() -> type[Any]: + try: + from hyperlight_sandbox import Sandbox + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( + "Hyperlight support requires `hyperlight-sandbox`, `hyperlight-sandbox-python-guest`, " + "and a compatible backend package such as `hyperlight-sandbox-backend-wasm`." 
+ ) from exc + + return Sandbox + + +def _passthrough_result_parser(result: Any) -> str: + return repr(result) + + +def _collect_tools(*tool_groups: Any) -> list[FunctionTool]: + tools: list[FunctionTool] = [] + seen_names: set[str] = set() + + for tool_group in tool_groups: + normalized_group = normalize_tools(tool_group) + for tool_obj in normalized_group: + if not isinstance(tool_obj, FunctionTool): + continue + if tool_obj.name == "execute_code" or tool_obj.name in seen_names: + continue + seen_names.add(tool_obj.name) + tools.append(tool_obj) + + return tools + + +def _resolve_execute_code_approval_mode( + *, + base_approval_mode: ApprovalMode, + tools: Sequence[FunctionTool], +) -> ApprovalMode: + if base_approval_mode == "always_require": + return "always_require" + + if any(tool_obj.approval_mode == "always_require" for tool_obj in tools): + return "always_require" + + return "never_require" + + +def _resolve_existing_path(value: str | Path) -> Path: + return Path(value).expanduser().resolve(strict=True) + + +def _resolve_workspace_root(value: str | Path | None) -> Path | None: + if value is None: + return None + + resolved_path = _resolve_existing_path(value) + if not resolved_path.is_dir(): + raise ValueError("workspace_root must point to an existing directory.") + return resolved_path + + +def _normalize_domain(target: str) -> str: + candidate = target.strip() + if not candidate: + raise ValueError("Domain entries must not be empty.") + + parsed = urlparse(candidate if "://" in candidate else f"//{candidate}") + normalized = (parsed.netloc or parsed.path).strip().rstrip("/") + if not normalized: + raise ValueError(f"Could not normalize domain entry: {target!r}.") + return normalized.lower() + + +def _normalize_http_method(method: str) -> str: + normalized = method.strip().upper() + if not normalized: + raise ValueError("HTTP method entries must not be empty.") + return normalized + + +def _normalize_mount_path(mount_path: str) -> str: + raw_path = 
mount_path.strip().replace("\\", "/") + if not raw_path: + raise ValueError("mount_path must not be empty.") + + pure_path = PurePosixPath(raw_path) + parts = [part for part in pure_path.parts if part not in {"", "/", "."}] + if parts and parts[0] == "input": + parts = parts[1:] + if any(part == ".." for part in parts): + raise ValueError("mount_path must stay within /input.") + if not parts: + raise ValueError("mount_path must point to a concrete path under /input.") + return "/".join(parts) + + +def _display_mount_path(mount_path: str) -> str: + return f"/input/{mount_path}" + + +def _path_tree_signature(path: Path) -> tuple[tuple[str, int, int], ...]: + if path.is_file(): + stat = path.stat() + return ((path.name, int(stat.st_size), int(stat.st_mtime_ns)),) + + entries: list[tuple[str, int, int]] = [] + for candidate in sorted(path.rglob("*"), key=lambda value: value.as_posix()): + try: + stat = candidate.stat() + except FileNotFoundError: + continue + relative_path = candidate.relative_to(path).as_posix() + size = int(stat.st_size) if candidate.is_file() else 0 + entries.append((relative_path, size, int(stat.st_mtime_ns))) + return tuple(entries) + + +def _copy_path(source: Path, destination: Path) -> None: + if source.is_dir(): + destination.mkdir(parents=True, exist_ok=True) + for child in sorted(source.iterdir(), key=lambda value: value.name): + _copy_path(child, destination / child.name) + return + + destination.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(source, destination) + + +def _populate_input_dir(*, config: _RunConfig, input_root: Path) -> None: + if config.workspace_root is not None: + for child in sorted(config.workspace_root.iterdir(), key=lambda value: value.name): + _copy_path(child, input_root / child.name) + + for mount in config.file_mounts: + _copy_path(mount.host_path, input_root / mount.mount_path) + + +def _create_file_content(file_path: Path, *, relative_path: str) -> Content: + media_type = 
mimetypes.guess_type(file_path.name)[0] or "application/octet-stream" + return Content.from_data( + data=file_path.read_bytes(), + media_type=media_type, + additional_properties={"path": f"/output/{relative_path}"}, + ) + + +def _parse_output_files(*, sandbox: Any, output_dir: TemporaryDirectory[str] | None) -> list[Content]: + if output_dir is None or not hasattr(sandbox, "get_output_files"): + return [] + + try: + output_files = sandbox.get_output_files() + except Exception: + return [] + + contents: list[Content] = [] + root = Path(output_dir.name) + + for output_file in output_files: + raw_path = str(output_file).replace("\\", "/") + pure_path = PurePosixPath(raw_path) + parts = [part for part in pure_path.parts if part not in {"", "/", "."}] + if parts and parts[0] == "output": + parts = parts[1:] + if not parts or any(part == ".." for part in parts): + continue + + relative_path = "/".join(parts) + host_path = root.joinpath(*parts) + if host_path.is_file(): + contents.append(_create_file_content(host_path, relative_path=relative_path)) + + return contents + + +def _build_execution_contents( + *, + result: Any, + sandbox: Any, + output_dir: TemporaryDirectory[str] | None, +) -> list[Content]: + success = bool(getattr(result, "success", False)) + stdout = str(getattr(result, "stdout", "") or "").replace("\r\n", "\n") or None + stderr = str(getattr(result, "stderr", "") or "").replace("\r\n", "\n") or None + outputs: list[Content] = [] + + if stdout is not None: + outputs.append(Content.from_text(stdout, raw_representation=result)) + + outputs.extend(_parse_output_files(sandbox=sandbox, output_dir=output_dir)) + + if success: + if stderr is not None: + outputs.append(Content.from_text(stderr, raw_representation=result)) + if not outputs: + outputs.append(Content.from_text("Code executed successfully without output.")) + return [Content.from_code_interpreter_tool_result(outputs=outputs, raw_representation=result)] + + error_details = stderr or "Unknown sandbox 
error" + outputs.append( + Content.from_error( + message="Execution error", + error_details=error_details, + raw_representation=result, + ) + ) + return [Content.from_code_interpreter_tool_result(outputs=outputs, raw_representation=result)] + + +def _make_sandbox_callback(tool_obj: FunctionTool) -> Callable[..., Any]: + sandbox_tool = copy.copy(tool_obj) + sandbox_tool.result_parser = _passthrough_result_parser + + async def _callback(**kwargs: Any) -> Any: + contents = await sandbox_tool.invoke(arguments=kwargs) + + values: list[Any] = [] + for content in contents: + if content.type == "text" and content.text is not None: + try: + values.append(ast.literal_eval(content.text)) + except (SyntaxError, ValueError): + values.append(content.text) + continue + + values.append(content.to_dict()) + + if len(values) == 1: + return values[0] + return values + + return _callback + + +class _SandboxRegistry: + def __init__(self) -> None: + self._entries: dict[tuple[Any, ...], _SandboxEntry] = {} + self._entries_lock = threading.RLock() + + def execute(self, *, config: _RunConfig, code: str) -> list[Content]: + cache_key = config.cache_key() + with self._entries_lock: + entry = self._entries.get(cache_key) + if entry is None: + entry = self._create_entry(config) + self._entries[cache_key] = entry + + with entry.lock: + entry.sandbox.restore(entry.snapshot) + result = entry.sandbox.run(code=code) + return _build_execution_contents(result=result, sandbox=entry.sandbox, output_dir=entry.output_dir) + + def _create_entry(self, config: _RunConfig) -> _SandboxEntry: + input_dir_handle = TemporaryDirectory() if config.filesystem_mode != "none" else None + output_dir_handle = TemporaryDirectory() if config.filesystem_mode == "read_write" else None + + if input_dir_handle is not None: + _populate_input_dir(config=config, input_root=Path(input_dir_handle.name)) + + sandbox_cls = _load_sandbox_class() + try: + sandbox = sandbox_cls( + backend=config.backend, + module=config.module, + 
module_path=config.module_path, + input_dir=input_dir_handle.name if input_dir_handle is not None else None, + output_dir=output_dir_handle.name if output_dir_handle is not None else None, + ) + except ImportError as exc: + raise RuntimeError( + "The selected Hyperlight backend is not installed or not supported on this platform. " + "Install a compatible backend package, such as `hyperlight-sandbox-backend-wasm`." + ) from exc + + for tool_obj in config.tools: + sandbox.register_tool(tool_obj.name, _make_sandbox_callback(tool_obj)) + + if config.network_mode == "allow_list": + methods = list(config.allowed_http_methods) or None + for domain in config.allowed_domains: + sandbox.allow_domain(domain, methods=methods) + + sandbox.run("None") + snapshot = sandbox.snapshot() + return _SandboxEntry( + sandbox=sandbox, + snapshot=snapshot, + input_dir=input_dir_handle, + output_dir=output_dir_handle, + lock=threading.RLock(), + ) + + +class HyperlightExecuteCodeTool(FunctionTool): + """Execute Python code inside a Hyperlight sandbox.""" + + def __init__( + self, + *, + tools: FunctionTool | Callable[..., Any] | Sequence[FunctionTool | Callable[..., Any]] | None = None, + approval_mode: ApprovalMode | None = None, + filesystem_mode: FilesystemMode = "none", + workspace_root: str | Path | None = None, + file_mounts: FileMount | Sequence[FileMount] | None = None, + network_mode: NetworkMode = "none", + allowed_domains: str | Sequence[str] | None = None, + allowed_http_methods: str | Sequence[str] | None = None, + backend: str = DEFAULT_HYPERLIGHT_BACKEND, + module: str | None = DEFAULT_HYPERLIGHT_MODULE, + module_path: str | None = None, + _registry: SandboxRuntime | None = None, + ) -> None: + super().__init__( + name="execute_code", + description=EXECUTE_CODE_INPUT_DESCRIPTION, + approval_mode="never_require", + func=self._run_code, + input_model=_ExecuteCodeInput, + ) + self._state_lock = threading.RLock() + self._registry = _registry or _SandboxRegistry() + 
self._default_approval_mode: ApprovalMode = approval_mode or "never_require" + self._filesystem_mode: FilesystemMode = filesystem_mode + self._workspace_root = _resolve_workspace_root(workspace_root) + if self._filesystem_mode == "none" and self._workspace_root is not None: + raise ValueError("workspace_root requires filesystem_mode to be 'read_only' or 'read_write'.") + self._network_mode: NetworkMode = network_mode + self._backend: str = backend + self._module: str | None = module + self._module_path: str | None = module_path + self._managed_tools: list[FunctionTool] = [] + self._file_mounts: dict[str, _StoredFileMount] = {} + self._allowed_domains: set[str] = set() + self._allowed_http_methods: set[str] = set() + + if tools is not None: + self.add_tools(tools) + if file_mounts is not None: + self.add_file_mounts(file_mounts) + if allowed_http_methods is not None: + self.add_allowed_http_methods(allowed_http_methods) + if allowed_domains is not None: + self.add_allowed_domains(allowed_domains) + + self._refresh_approval_mode() + + @property + def description(self) -> str: + state_lock = getattr(self, "_state_lock", None) + if state_lock is None: + return str(self.__dict__.get("description", EXECUTE_CODE_INPUT_DESCRIPTION)) + + with state_lock: + return build_execute_code_description( + tools=self._managed_tools, + filesystem_mode=self._filesystem_mode, + workspace_enabled=self._workspace_root is not None, + mounted_paths=[_display_mount_path(mount.mount_path) for mount in self._file_mounts.values()], + network_mode=self._network_mode, + allowed_domains=sorted(self._allowed_domains), + allowed_http_methods=sorted(self._allowed_http_methods), + ) + + @description.setter + def description(self, value: str) -> None: + self.__dict__["description"] = value + + def add_tools( + self, + tools: FunctionTool | Callable[..., Any] | Sequence[FunctionTool | Callable[..., Any]], + ) -> None: + """Add sandbox-managed tools to this execute_code surface.""" + with 
self._state_lock: + combined_tools = _collect_tools(self._managed_tools, tools) + self._managed_tools = combined_tools + self._refresh_approval_mode() + + def get_tools(self) -> list[FunctionTool]: + """Return the currently managed sandbox tools.""" + with self._state_lock: + return list(self._managed_tools) + + def remove_tool(self, name: str) -> None: + """Remove one managed sandbox tool by name.""" + with self._state_lock: + remaining_tools = [tool_obj for tool_obj in self._managed_tools if tool_obj.name != name] + if len(remaining_tools) == len(self._managed_tools): + raise KeyError(f"No managed tool named {name!r} is registered.") + self._managed_tools = remaining_tools + self._refresh_approval_mode() + + def clear_tools(self) -> None: + """Remove all managed sandbox tools.""" + with self._state_lock: + self._managed_tools = [] + self._refresh_approval_mode() + + def add_file_mounts(self, file_mounts: FileMount | Sequence[FileMount]) -> None: + """Add one or more file mounts under `/input`.""" + if self._filesystem_mode == "none": + raise ValueError("File mounts require filesystem_mode to be 'read_only' or 'read_write'.") + + mounts = [file_mounts] if isinstance(file_mounts, FileMount) else list(file_mounts) + normalized_mounts = [ + _StoredFileMount( + host_path=_resolve_existing_path(mount.host_path), + mount_path=_normalize_mount_path(mount.mount_path), + ) + for mount in mounts + ] + + with self._state_lock: + for mount in normalized_mounts: + self._file_mounts[mount.mount_path] = mount + + def get_file_mounts(self) -> list[FileMount]: + """Return the configured file mounts.""" + with self._state_lock: + return [ + FileMount(host_path=mount.host_path, mount_path=_display_mount_path(mount.mount_path)) + for mount in self._file_mounts.values() + ] + + def remove_file_mount(self, mount_path: str) -> None: + """Remove one file mount by its sandbox path.""" + normalized_mount_path = _normalize_mount_path(mount_path) + with self._state_lock: + if 
normalized_mount_path not in self._file_mounts: + raise KeyError(f"No file mount exists for {mount_path!r}.") + del self._file_mounts[normalized_mount_path] + + def clear_file_mounts(self) -> None: + """Remove all configured file mounts.""" + with self._state_lock: + self._file_mounts.clear() + + def add_allowed_domains(self, domains: str | Sequence[str]) -> None: + """Add one or more outbound allow-list domains.""" + if self._network_mode == "none": + raise ValueError("Allowed domains require network_mode='allow_list'.") + + normalized_domains = ( + {_normalize_domain(domains)} + if isinstance(domains, str) + else {_normalize_domain(domain) for domain in domains} + ) + with self._state_lock: + self._allowed_domains.update(normalized_domains) + + def get_allowed_domains(self) -> list[str]: + """Return the configured outbound allow-list domains.""" + with self._state_lock: + return sorted(self._allowed_domains) + + def remove_allowed_domain(self, domain: str) -> None: + """Remove one outbound allow-list domain.""" + normalized_domain = _normalize_domain(domain) + with self._state_lock: + if normalized_domain not in self._allowed_domains: + raise KeyError(f"No allowed domain exists for {domain!r}.") + self._allowed_domains.remove(normalized_domain) + + def clear_allowed_domains(self) -> None: + """Remove all outbound allow-list domains.""" + with self._state_lock: + self._allowed_domains.clear() + + def add_allowed_http_methods(self, methods: str | Sequence[str]) -> None: + """Add one or more outbound HTTP methods for the allow-list policy.""" + if self._network_mode == "none": + raise ValueError("Allowed HTTP methods require network_mode='allow_list'.") + + normalized_methods = ( + {_normalize_http_method(methods)} + if isinstance(methods, str) + else {_normalize_http_method(method) for method in methods} + ) + with self._state_lock: + self._allowed_http_methods.update(normalized_methods) + + def get_allowed_http_methods(self) -> list[str]: + """Return the 
configured outbound allow-list HTTP methods.""" + with self._state_lock: + return sorted(self._allowed_http_methods) + + def remove_allowed_http_method(self, method: str) -> None: + """Remove one outbound allow-list HTTP method.""" + normalized_method = _normalize_http_method(method) + with self._state_lock: + if normalized_method not in self._allowed_http_methods: + raise KeyError(f"No allowed HTTP method exists for {method!r}.") + self._allowed_http_methods.remove(normalized_method) + + def clear_allowed_http_methods(self) -> None: + """Remove all outbound allow-list HTTP methods.""" + with self._state_lock: + self._allowed_http_methods.clear() + + def build_instructions(self, *, tools_visible_to_model: bool) -> str: + """Build the current CodeAct instructions for this execute_code surface.""" + config = self._build_run_config() + return build_codeact_instructions( + tools=config.tools, + tools_visible_to_model=tools_visible_to_model, + filesystem_mode=config.filesystem_mode, + workspace_enabled=config.workspace_root is not None, + mounted_paths=config.mounted_paths, + network_mode=config.network_mode, + allowed_domains=config.allowed_domains, + allowed_http_methods=config.allowed_http_methods, + ) + + def create_run_tool(self) -> HyperlightExecuteCodeTool: + """Create a run-scoped snapshot of this execute_code surface.""" + file_mounts = self.get_file_mounts() + allowed_domains = self.get_allowed_domains() + allowed_http_methods = self.get_allowed_http_methods() + + return HyperlightExecuteCodeTool( + tools=self.get_tools(), + approval_mode=self._default_approval_mode, + filesystem_mode=self._filesystem_mode, + workspace_root=self._workspace_root, + file_mounts=file_mounts or None, + network_mode=self._network_mode, + allowed_domains=allowed_domains or None, + allowed_http_methods=allowed_http_methods or None, + backend=self._backend, + module=self._module, + module_path=self._module_path, + _registry=self._registry, + ) + + def build_serializable_state(self) -> 
dict[str, Any]: + """Return a JSON-serializable snapshot of the effective run state.""" + config = self._build_run_config() + return { + "backend": config.backend, + "module": config.module, + "module_path": config.module_path, + "approval_mode": config.approval_mode, + "tool_names": [tool_obj.name for tool_obj in config.tools], + "filesystem_mode": config.filesystem_mode, + "workspace_root": str(config.workspace_root) if config.workspace_root is not None else None, + "file_mounts": [ + { + "host_path": str(mount.host_path), + "mount_path": _display_mount_path(mount.mount_path), + } + for mount in config.file_mounts + ], + "network_mode": config.network_mode, + "allowed_domains": list(config.allowed_domains), + "allowed_http_methods": list(config.allowed_http_methods), + } + + def to_dict(self, *, exclude: set[str] | None = None, exclude_none: bool = True) -> dict[str, Any]: + self.__dict__["description"] = self.description + return super().to_dict(exclude=exclude, exclude_none=exclude_none) + + def _refresh_approval_mode(self) -> None: + self.approval_mode = _resolve_execute_code_approval_mode( + base_approval_mode=self._default_approval_mode, + tools=self._managed_tools, + ) + + def _build_run_config(self) -> _RunConfig: + with self._state_lock: + managed_tools = tuple(self._managed_tools) + workspace_root = self._workspace_root + stored_mounts = tuple(self._file_mounts.values()) + allowed_domains = tuple(sorted(self._allowed_domains)) + allowed_http_methods = tuple(sorted(self._allowed_http_methods)) + approval_mode = _resolve_execute_code_approval_mode( + base_approval_mode=self._default_approval_mode, + tools=managed_tools, + ) + + workspace_signature = _path_tree_signature(workspace_root) if workspace_root is not None else () + normalized_mounts = tuple( + _NormalizedFileMount( + host_path=mount.host_path, + mount_path=mount.mount_path, + path_signature=_path_tree_signature(mount.host_path), + ) + for mount in stored_mounts + ) + + return _RunConfig( + 
backend=self._backend, + module=self._module, + module_path=self._module_path, + approval_mode=approval_mode, + tools=managed_tools, + filesystem_mode=self._filesystem_mode, + workspace_root=workspace_root, + workspace_signature=workspace_signature, + file_mounts=normalized_mounts, + network_mode=self._network_mode, + allowed_domains=allowed_domains, + allowed_http_methods=allowed_http_methods, + ) + + def _run_code(self, *, code: str) -> list[Content]: + config = self._build_run_config() + return self._registry.execute(config=config, code=code) diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_instructions.py b/python/packages/hyperlight/agent_framework_hyperlight/_instructions.py new file mode 100644 index 0000000000..77e7993e95 --- /dev/null +++ b/python/packages/hyperlight/agent_framework_hyperlight/_instructions.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft. All rights reserved. + +from __future__ import annotations + +from collections.abc import Sequence + +from agent_framework import FunctionTool + +from ._types import FilesystemMode, NetworkMode + + +def _format_tool_summaries(tools: Sequence[FunctionTool]) -> str: + if not tools: + return "- No tools are currently registered inside the sandbox." + + lines: list[str] = [] + for tool_obj in tools: + parameters = tool_obj.parameters().get("properties", {}) + parameter_names = [name for name in parameters if isinstance(name, str)] + parameter_summary = ", ".join(parameter_names) if parameter_names else "none" + description = str(tool_obj.description or "").strip() or "No description provided." + lines.append(f"- `{tool_obj.name}`: {description} Parameters: {parameter_summary}.") + return "\n".join(lines) + + +def _format_filesystem_capabilities( + *, + filesystem_mode: FilesystemMode, + workspace_enabled: bool, + mounted_paths: Sequence[str], +) -> str: + if filesystem_mode == "none": + return "Filesystem access is disabled." 
+ + lines = ["Filesystem access is enabled."] + lines.append("Read files from `/input`.") + if filesystem_mode == "read_write": + lines.append("Write generated artifacts to `/output`; returned files will be attached to the tool result.") + else: + lines.append("The sandbox does not expose a writable `/output` directory in this configuration.") + + if workspace_enabled: + lines.append("The configured workspace root is available under `/input/`.") + + if mounted_paths: + lines.append("Additional mounted paths:") + lines.extend(f"- `{mounted_path}`" for mounted_path in mounted_paths) + elif not workspace_enabled: + lines.append("No workspace root or explicit file mounts are currently configured.") + + return "\n".join(lines) + + +def _format_network_capabilities( + *, + network_mode: NetworkMode, + allowed_domains: Sequence[str], + allowed_http_methods: Sequence[str], +) -> str: + if network_mode == "none": + return "Outbound network access is disabled." + + methods_text = ", ".join(allowed_http_methods) if allowed_http_methods else "all methods allowed by the backend" + if not allowed_domains: + return "Outbound network access uses an allow-list, but no domains are currently configured." + + lines = [ + "Outbound network access uses an allow-list.", + f"Allowed HTTP methods: {methods_text}.", + "Allowed domains:", + ] + lines.extend(f"- `{domain}`" for domain in allowed_domains) + return "\n".join(lines) + + +def build_codeact_instructions( + *, + tools: Sequence[FunctionTool], + tools_visible_to_model: bool, + filesystem_mode: FilesystemMode, + workspace_enabled: bool, + mounted_paths: Sequence[str], + network_mode: NetworkMode, + allowed_domains: Sequence[str], + allowed_http_methods: Sequence[str], +) -> str: + """Build dynamic CodeAct instructions for the effective sandbox state.""" + usage_note = ( + "Some tools may also appear directly, but prefer `execute_code` whenever you need to combine Python " + "control flow with sandbox tool calls." 
+ if tools_visible_to_model + else "Provider-owned sandbox tools are not exposed separately; use `execute_code` when you need them." + ) + + return f"""You have one primary tool: execute_code. + +Prefer one execute_code call per request when possible. +Its tool description contains the current `call_tool(...)` guidance, sandbox +tool registry, and capability limits. + +{usage_note} +""" + + +def build_execute_code_description( + *, + tools: Sequence[FunctionTool], + filesystem_mode: FilesystemMode, + workspace_enabled: bool, + mounted_paths: Sequence[str], + network_mode: NetworkMode, + allowed_domains: Sequence[str], + allowed_http_methods: Sequence[str], +) -> str: + """Build the dynamic execute_code tool description for standalone usage.""" + filesystem_text = _format_filesystem_capabilities( + filesystem_mode=filesystem_mode, + workspace_enabled=workspace_enabled, + mounted_paths=mounted_paths, + ) + network_text = _format_network_capabilities( + network_mode=network_mode, + allowed_domains=allowed_domains, + allowed_http_methods=allowed_http_methods, + ) + + return f"""Execute Python in an isolated Hyperlight sandbox. + +Inside the sandbox, `call_tool(name, **kwargs)` is available as a built-in for +registered host callbacks. Use the tool name as the first argument and keyword +arguments only. Do not pass a dict or any other positional arguments after the +tool name. + +Registered sandbox tools: +{_format_tool_summaries(tools)} + +Filesystem capabilities: +{filesystem_text} + +Network capabilities: +{network_text} + +Prefer `execute_code` when you need to combine one or more `call_tool(...)` +calls with Python control flow, loops, or post-processing. 
+""" diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_provider.py b/python/packages/hyperlight/agent_framework_hyperlight/_provider.py new file mode 100644 index 0000000000..e6820dc440 --- /dev/null +++ b/python/packages/hyperlight/agent_framework_hyperlight/_provider.py @@ -0,0 +1,133 @@ +# Copyright (c) Microsoft. All rights reserved. + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from pathlib import Path +from typing import Any + +from agent_framework import AgentSession, ContextProvider, FunctionTool, SessionContext +from agent_framework._tools import ApprovalMode + +from ._execute_code_tool import HyperlightExecuteCodeTool, SandboxRuntime +from ._types import FileMount, FilesystemMode, NetworkMode + + +class HyperlightCodeActProvider(ContextProvider): + """Inject a Hyperlight-backed CodeAct surface using provider-owned tools.""" + + DEFAULT_SOURCE_ID = "hyperlight_codeact" + + def __init__( + self, + source_id: str = DEFAULT_SOURCE_ID, + *, + tools: FunctionTool | Callable[..., Any] | Sequence[FunctionTool | Callable[..., Any]] | None = None, + approval_mode: ApprovalMode | None = None, + filesystem_mode: FilesystemMode = "none", + workspace_root: str | Path | None = None, + file_mounts: FileMount | Sequence[FileMount] | None = None, + network_mode: NetworkMode = "none", + allowed_domains: str | Sequence[str] | None = None, + allowed_http_methods: str | Sequence[str] | None = None, + backend: str = "wasm", + module: str | None = "python_guest.path", + module_path: str | None = None, + _registry: SandboxRuntime | None = None, + ) -> None: + super().__init__(source_id) + self._execute_code_tool = HyperlightExecuteCodeTool( + tools=tools, + approval_mode=approval_mode, + filesystem_mode=filesystem_mode, + workspace_root=workspace_root, + file_mounts=file_mounts, + network_mode=network_mode, + allowed_domains=allowed_domains, + allowed_http_methods=allowed_http_methods, + backend=backend, + 
module=module, + module_path=module_path, + _registry=_registry, + ) + + def add_tools( + self, + tools: FunctionTool | Callable[..., Any] | Sequence[FunctionTool | Callable[..., Any]], + ) -> None: + """Add provider-owned sandbox tools.""" + self._execute_code_tool.add_tools(tools) + + def get_tools(self) -> list[FunctionTool]: + """Return the provider-owned sandbox tools.""" + return self._execute_code_tool.get_tools() + + def remove_tool(self, name: str) -> None: + """Remove one provider-owned sandbox tool by name.""" + self._execute_code_tool.remove_tool(name) + + def clear_tools(self) -> None: + """Remove all provider-owned sandbox tools.""" + self._execute_code_tool.clear_tools() + + def add_file_mounts(self, file_mounts: FileMount | Sequence[FileMount]) -> None: + """Add provider-managed file mounts.""" + self._execute_code_tool.add_file_mounts(file_mounts) + + def get_file_mounts(self) -> list[FileMount]: + """Return the provider-managed file mounts.""" + return self._execute_code_tool.get_file_mounts() + + def remove_file_mount(self, mount_path: str) -> None: + """Remove one provider-managed file mount.""" + self._execute_code_tool.remove_file_mount(mount_path) + + def clear_file_mounts(self) -> None: + """Remove all provider-managed file mounts.""" + self._execute_code_tool.clear_file_mounts() + + def add_allowed_domains(self, domains: str | Sequence[str]) -> None: + """Add provider-managed outbound allow-list domains.""" + self._execute_code_tool.add_allowed_domains(domains) + + def get_allowed_domains(self) -> list[str]: + """Return the provider-managed outbound allow-list domains.""" + return self._execute_code_tool.get_allowed_domains() + + def remove_allowed_domain(self, domain: str) -> None: + """Remove one provider-managed outbound allow-list domain.""" + self._execute_code_tool.remove_allowed_domain(domain) + + def clear_allowed_domains(self) -> None: + """Remove all provider-managed outbound allow-list domains.""" + 
self._execute_code_tool.clear_allowed_domains() + + def add_allowed_http_methods(self, methods: str | Sequence[str]) -> None: + """Add provider-managed outbound HTTP methods.""" + self._execute_code_tool.add_allowed_http_methods(methods) + + def get_allowed_http_methods(self) -> list[str]: + """Return the provider-managed outbound HTTP methods.""" + return self._execute_code_tool.get_allowed_http_methods() + + def remove_allowed_http_method(self, method: str) -> None: + """Remove one provider-managed outbound HTTP method.""" + self._execute_code_tool.remove_allowed_http_method(method) + + def clear_allowed_http_methods(self) -> None: + """Remove all provider-managed outbound HTTP methods.""" + self._execute_code_tool.clear_allowed_http_methods() + + async def before_run( + self, + *, + agent: Any, + session: AgentSession | None, + context: SessionContext, + state: dict[str, Any], + ) -> None: + """Inject CodeAct instructions and a run-scoped execute_code tool before each run.""" + run_tool = self._execute_code_tool.create_run_tool() + state[self.source_id] = run_tool.build_serializable_state() + context.extend_instructions(self.source_id, run_tool.build_instructions(tools_visible_to_model=False)) + context.extend_tools(self.source_id, [run_tool]) diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_types.py b/python/packages/hyperlight/agent_framework_hyperlight/_types.py new file mode 100644 index 0000000000..91c363130e --- /dev/null +++ b/python/packages/hyperlight/agent_framework_hyperlight/_types.py @@ -0,0 +1,18 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Literal + +FilesystemMode = Literal["none", "read_only", "read_write"] +NetworkMode = Literal["none", "allow_list"] + + +@dataclass(frozen=True, slots=True) +class FileMount: + """Map a host file or directory into the sandbox input tree.""" + + host_path: str | Path + mount_path: str diff --git a/python/packages/hyperlight/pyproject.toml b/python/packages/hyperlight/pyproject.toml new file mode 100644 index 0000000000..1e0a75abc7 --- /dev/null +++ b/python/packages/hyperlight/pyproject.toml @@ -0,0 +1,102 @@ +[project] +name = "agent-framework-hyperlight" +description = "Hyperlight CodeAct integrations for Microsoft Agent Framework." +authors = [{ name = "Microsoft", email = "af-support@microsoft.com"}] +readme = "README.md" +requires-python = ">=3.10" +version = "1.0.0a260409" +license-files = ["LICENSE"] +urls.homepage = "https://aka.ms/agent-framework" +urls.source = "https://github.com/microsoft/agent-framework/tree/main/python" +urls.release_notes = "https://github.com/microsoft/agent-framework/releases?q=tag%3Apython-1&expanded=true" +urls.issues = "https://github.com/microsoft/agent-framework/issues" +classifiers = [ + "License :: OSI Approved :: MIT License", + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Typing :: Typed", +] +dependencies = [ + "agent-framework-core>=1.0.0,<2", + "hyperlight-sandbox>=0.3.0,<0.4", + "hyperlight-sandbox-backend-wasm>=0.3.0,<0.4 ; sys_platform == 'linux' or sys_platform == 'win32'", + "hyperlight-sandbox-python-guest>=0.3.0,<0.4", +] + +[tool.uv] +prerelease = "if-necessary-or-explicit" +environments = [ + "sys_platform 
== 'linux'", + "sys_platform == 'win32'" +] + +[tool.uv-dynamic-versioning] +fallback-version = "0.0.0" + +[tool.pytest.ini_options] +testpaths = 'tests' +addopts = "-ra -q -r fEX" +asyncio_mode = "auto" +asyncio_default_fixture_loop_scope = "function" +filterwarnings = [] +timeout = 120 +markers = [ + "integration: marks tests as integration tests that require external services", +] + +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.per-file-ignores] +"samples/**" = ["INP", "T201"] +"tests/**" = ["D", "INP", "TD", "ERA001", "RUF", "S"] + +[tool.coverage.run] +omit = [ + "**/__init__.py" +] + +[tool.pyright] +extends = "../../pyproject.toml" +include = ["agent_framework_hyperlight"] +exclude = ['tests'] + +[tool.mypy] +plugins = ['pydantic.mypy'] +strict = true +python_version = "3.10" +ignore_missing_imports = true +disallow_untyped_defs = true +no_implicit_optional = true +check_untyped_defs = true +warn_return_any = true +show_error_codes = true +warn_unused_ignores = false +disallow_incomplete_defs = true +disallow_untyped_decorators = true + +[tool.bandit] +targets = ["agent_framework_hyperlight"] +exclude_dirs = ["tests", "samples"] + +[tool.poe] +executor.type = "uv" +include = "../../shared_tasks.toml" + +[tool.poe.tasks.mypy] +help = "Run MyPy for this package." +cmd = "mypy --config-file $POE_ROOT/pyproject.toml agent_framework_hyperlight" + +[tool.poe.tasks.test] +help = "Run the default unit test suite for this package." 
+cmd = 'pytest -m "not integration" --cov=agent_framework_hyperlight --cov-report=term-missing:skip-covered tests' + +[build-system] +requires = ["flit-core >= 3.11,<4.0"] +build-backend = "flit_core.buildapi" diff --git a/python/packages/hyperlight/samples/README.md b/python/packages/hyperlight/samples/README.md new file mode 100644 index 0000000000..18896c4aa3 --- /dev/null +++ b/python/packages/hyperlight/samples/README.md @@ -0,0 +1,16 @@ +# Hyperlight CodeAct samples + +These samples demonstrate the alpha `agent-framework-hyperlight` package. + +- `codeact_context_provider.py` shows the provider-owned CodeAct model where the + agent only sees `execute_code` and sandbox tools are owned by + `HyperlightCodeActProvider`. +- `codeact_tool.py` shows the standalone `HyperlightExecuteCodeTool` surface + where `execute_code` is added directly to the agent tool list. + +Run the samples from the repository's `python/` directory after installing the workspace dependencies: + +```bash +uv run --directory packages/hyperlight python samples/codeact_context_provider.py +uv run --directory packages/hyperlight python samples/codeact_tool.py +``` diff --git a/python/packages/hyperlight/samples/codeact_context_provider.py b/python/packages/hyperlight/samples/codeact_context_provider.py new file mode 100644 index 0000000000..c0cc03c2f6 --- /dev/null +++ b/python/packages/hyperlight/samples/codeact_context_provider.py @@ -0,0 +1,192 @@ +# Copyright (c) Microsoft. All rights reserved.
+ +from __future__ import annotations + +import asyncio +import logging +import os +from collections.abc import Awaitable, Callable +from typing import Annotated, Any, Literal + +from agent_framework import Agent, FunctionInvocationContext, function_middleware, tool +from agent_framework.foundry import FoundryChatClient +from azure.identity import AzureCliCredential +from dotenv import load_dotenv + +from agent_framework_hyperlight import HyperlightCodeActProvider + +"""This sample demonstrates the provider-owned Hyperlight CodeAct flow. + +The sample keeps `compute` and `fetch_data` off the direct agent tool surface and +registers them only with `HyperlightCodeActProvider`. The model therefore sees a +single `execute_code` tool and must call the provider-owned tools from inside +the sandbox with `call_tool(...)`. +""" + +load_dotenv() + +_CYAN = "\033[36m" +_YELLOW = "\033[33m" +_GREEN = "\033[32m" +_DIM = "\033[2m" +_RESET = "\033[0m" + + +class _ColoredFormatter(logging.Formatter): + """Dim logger output so it does not compete with sample prints.""" + + def format(self, record: logging.LogRecord) -> str: + return f"{_DIM}{super().format(record)}{_RESET}" + + +logging.basicConfig(level=logging.WARNING) +logging.getLogger().handlers[0].setFormatter( + _ColoredFormatter("[%(asctime)s] %(levelname)s: %(message)s"), +) + + +@function_middleware +async def log_function_calls( + context: FunctionInvocationContext, + call_next: Callable[[], Awaitable[None]], +) -> None: + """Log tool calls, including readable execute_code blocks.""" + import time + + function_name = context.function.name + arguments = context.arguments if isinstance(context.arguments, dict) else {} + + if function_name == "execute_code" and "code" in arguments: + print(f"\n{_YELLOW}{'─' * 60}") + print("▶ execute_code") + print(f"{'─' * 60}{_RESET}") + print(arguments["code"]) + print(f"{_YELLOW}{'─' * 60}{_RESET}") + else: + pairs = ", ".join(f"{name}={value!r}" for name, value in arguments.items()) + 
print(f"\n{_YELLOW}▶ {function_name}({pairs}){_RESET}") + + start = time.perf_counter() + await call_next() + elapsed = time.perf_counter() - start + + result = context.result + if function_name == "execute_code" and isinstance(result, list): + for item in result: + if item.type != "code_interpreter_tool_result": + continue + + for output in item.outputs or []: + if output.type == "text" and output.text: + print(f"{_GREEN}stdout:\n{output.text}{_RESET}") + if output.type == "error" and output.error_details: + print(f"{_YELLOW}stderr:\n{output.error_details}{_RESET}") + else: + print(f"{_YELLOW}◀ {function_name} → {result!r}{_RESET}") + + print(f"{_DIM} ({elapsed:.4f}s){_RESET}") + + +@tool(approval_mode="never_require") +def compute( + operation: Annotated[ + Literal["add", "subtract", "multiply", "divide"], + "Math operation: add, subtract, multiply, or divide.", + ], + a: Annotated[float, "First numeric operand."], + b: Annotated[float, "Second numeric operand."], +) -> float: + """Perform a math operation for sandboxed code.""" + operations = { + "add": a + b, + "subtract": a - b, + "multiply": a * b, + "divide": a / b if b else float("inf"), + } + return operations[operation] + + +@tool(approval_mode="never_require") +async def fetch_data( + table: Annotated[str, "Name of the simulated table to query."], +) -> list[dict[str, Any]]: + """Fetch records from a named table.""" + await asyncio.sleep(0.5) + data: dict[str, list[dict[str, Any]]] = { + "users": [ + {"id": 1, "name": "Alice", "role": "admin"}, + {"id": 2, "name": "Bob", "role": "user"}, + {"id": 3, "name": "Charlie", "role": "admin"}, + ], + "products": [ + {"id": 101, "name": "Widget", "price": 9.99}, + {"id": 102, "name": "Gadget", "price": 19.99}, + ], + } + return data.get(table, []) + + +async def main() -> None: + """Run the provider-owned Hyperlight CodeAct sample.""" + # 1. Create the Hyperlight-backed provider and register sandbox tools on it. 
+ codeact = HyperlightCodeActProvider( + tools=[compute, fetch_data], + approval_mode="never_require", + ) + + # 2. Create the client and the agent. + agent = Agent( + client=FoundryChatClient( + project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + model=os.environ["FOUNDRY_MODEL"], + credential=AzureCliCredential(), + ), + name="HyperlightCodeActProviderAgent", + instructions="You are a helpful assistant.", + context_providers=[codeact], + middleware=[log_function_calls], + ) + + # 3. Run a request that should use execute_code plus provider-owned tools. + query = ( + "Fetch all users, find admins, multiply 7*(3*2), and print the users, " + "admins, and multiplication result. Use execute_code and call_tool(...) " + "inside the sandbox." + ) + print(f"{_CYAN}{'=' * 60}") + print("Hyperlight CodeAct provider sample") + print(f"{'=' * 60}{_RESET}") + print(f"{_CYAN}User: {query}{_RESET}") + result = await agent.run(query) + print(f"{_CYAN}Agent: {result.text}{_RESET}") + + +""" +Sample output (shape only): + +============================================================ +Hyperlight CodeAct provider sample +============================================================ +User: Fetch all users, find admins, multiply 7*(3*2), ... + +──────────────────────────────────────────────────────────── +▶ execute_code +──────────────────────────────────────────────────────────── +users = call_tool("fetch_data", table="users") +admins = [user for user in users if user["role"] == "admin"] +result = call_tool("compute", operation="multiply", a=7, b=6) +print("Users:", users) +print("Admins:", admins) +print("7 * 6 =", result) +──────────────────────────────────────────────────────────── +stdout: +Users: [...] +Admins: [...] +7 * 6 = 42.0 + (0.0xxx s) +Agent: ... 
+""" + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/packages/hyperlight/samples/codeact_tool.py b/python/packages/hyperlight/samples/codeact_tool.py new file mode 100644 index 0000000000..64c0e6fde5 --- /dev/null +++ b/python/packages/hyperlight/samples/codeact_tool.py @@ -0,0 +1,110 @@ +# Copyright (c) Microsoft. All rights reserved. + +from __future__ import annotations + +import asyncio +import os +from typing import Annotated, Any, Literal + +from agent_framework import Agent, tool +from agent_framework.foundry import FoundryChatClient +from azure.identity import AzureCliCredential +from dotenv import load_dotenv + +from agent_framework_hyperlight import HyperlightExecuteCodeTool + +"""This sample demonstrates the standalone Hyperlight execute_code tool. + +The sample adds `HyperlightExecuteCodeTool` directly to the agent. The tool's +own description advertises `call_tool(...)`, the registered sandbox tools, and +the current capability configuration, so no extra CodeAct-specific agent +instructions are required. 
+""" + +load_dotenv() + + +@tool(approval_mode="never_require") +def compute( + operation: Annotated[ + Literal["add", "subtract", "multiply", "divide"], + "Math operation: add, subtract, multiply, or divide.", + ], + a: Annotated[float, "First numeric operand."], + b: Annotated[float, "Second numeric operand."], +) -> float: + """Perform a math operation used by sandboxed code.""" + operations = { + "add": a + b, + "subtract": a - b, + "multiply": a * b, + "divide": a / b if b else float("inf"), + } + return operations[operation] + + +@tool(approval_mode="never_require") +def fetch_data( + table: Annotated[str, "Name of the simulated table to query."], +) -> list[dict[str, Any]]: + """Fetch simulated records from a named table.""" + data: dict[str, list[dict[str, Any]]] = { + "users": [ + {"id": 1, "name": "Alice", "role": "admin"}, + {"id": 2, "name": "Bob", "role": "user"}, + {"id": 3, "name": "Charlie", "role": "admin"}, + ], + "products": [ + {"id": 101, "name": "Widget", "price": 9.99}, + {"id": 102, "name": "Gadget", "price": 19.99}, + ], + } + return data.get(table, []) + + +async def main() -> None: + """Run the standalone execute_code sample.""" + # 1. Create the packaged execute_code tool and register sandbox tools on it. + execute_code = HyperlightExecuteCodeTool( + tools=[compute, fetch_data], + approval_mode="never_require", + ) + + # 2. Create the client and the agent. + agent = Agent( + client=FoundryChatClient( + project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + model=os.environ["FOUNDRY_MODEL"], + credential=AzureCliCredential(), + ), + name="HyperlightExecuteCodeToolAgent", + instructions="You are a helpful assistant.", + tools=execute_code, + ) + + # 3. Run one request through the direct-tool surface. + print("=" * 60) + print("Hyperlight execute_code tool sample") + print("=" * 60) + query = ( + "Fetch all users, find admins, multiply 6*7, and print the users, admins, " + "and multiplication result. Use one execute_code call." 
+ ) + print(f"User: {query}") + result = await agent.run(query) + print(f"Agent: {result.text}") + + +""" +Sample output (shape only): + +============================================================ +Hyperlight execute_code tool sample +============================================================ +User: Fetch all users, find admins, multiply 6*7, ... +Agent: ... +""" + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py new file mode 100644 index 0000000000..335121150d --- /dev/null +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -0,0 +1,382 @@ +# Copyright (c) Microsoft. All rights reserved. + +from __future__ import annotations + +import asyncio +import importlib.metadata +import importlib.util +import inspect +import json +import os +import sys +import threading +from collections.abc import Awaitable, Callable, Mapping, MutableSequence +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import pytest +from agent_framework import ( + Agent, + BaseChatClient, + ChatResponse, + ChatResponseUpdate, + Content, + FunctionInvocationLayer, + Message, + ResponseStream, + tool, +) + +from agent_framework_hyperlight import FileMount, HyperlightCodeActProvider, HyperlightExecuteCodeTool +from agent_framework_hyperlight import _execute_code_tool as execute_code_module + + +def _hyperlight_integration_skip_reason() -> str | None: + enabled = os.getenv("RUN_HYPERLIGHT_INTEGRATION_TESTS", "").strip().lower() + if enabled not in {"1", "true", "yes"}: + return "Set RUN_HYPERLIGHT_INTEGRATION_TESTS=true to enable Hyperlight integration tests." + + if sys.platform not in {"linux", "win32"}: + return "Hyperlight integration tests require Linux or Windows runners." 
+ + if importlib.util.find_spec("hyperlight_sandbox") is None: + return "hyperlight-sandbox is not installed." + + if importlib.util.find_spec("python_guest") is None: + return "hyperlight-sandbox-python-guest is not installed." + + try: + importlib.metadata.version("hyperlight-sandbox-backend-wasm") + except importlib.metadata.PackageNotFoundError: + return "hyperlight-sandbox-backend-wasm is not installed." + + return None + + +skip_if_hyperlight_integration_tests_disabled = pytest.mark.skipif( + (reason := _hyperlight_integration_skip_reason()) is not None, + reason=reason or "Hyperlight integration tests are disabled.", +) + + +@tool(approval_mode="never_require") +def compute(a: int, b: int) -> int: + return a + b + + +@tool(approval_mode="always_require") +def dangerous_compute(a: int, b: int) -> int: + return a * b + + +@dataclass(slots=True) +class _FakeResult: + success: bool + stdout: str = "" + stderr: str = "" + + +def _run_in_thread(callback: Callable[[], Any]) -> Any: + result: dict[str, Any] = {} + error: dict[str, BaseException] = {} + + def _runner() -> None: + try: + result["value"] = callback() + except BaseException as exc: + error["value"] = exc + + thread = threading.Thread(target=_runner) + thread.start() + thread.join() + + if "value" in error: + raise error["value"] + + return result.get("value") + + +class _FakeSandbox: + instances: list[_FakeSandbox] = [] + + def __init__( + self, + *, + input_dir: str | None = None, + output_dir: str | None = None, + temp_output: bool = False, + backend: str = "wasm", + module: str | None = None, + module_path: str | None = None, + heap_size: str | None = None, + stack_size: str | None = None, + ) -> None: + self.input_dir = input_dir + self.output_dir = output_dir + self.registered_tools: dict[str, Any] = {} + self.allowed_domains: list[tuple[str, list[str] | None]] = [] + self.restore_calls: list[Any] = [] + self.output_files: list[str] = [] + _FakeSandbox.instances.append(self) + + def 
register_tool(self, name_or_tool: Any, callback: Any | None = None) -> None: + if callback is None: + raise AssertionError("Expected callback registration for sandbox tools.") + self.registered_tools[str(name_or_tool)] = callback + + def allow_domain(self, target: str, methods: list[str] | None = None) -> None: + self.allowed_domains.append((target, methods)) + + def _invoke_tool(self, name: str, **kwargs: Any) -> Any: + callback = self.registered_tools[name] + if inspect.iscoroutinefunction(callback): + return _run_in_thread(lambda: asyncio.run(callback(**kwargs))) + + result = callback(**kwargs) + if inspect.isawaitable(result): + return _run_in_thread(lambda: asyncio.run(result)) + return result + + def run(self, code: str) -> _FakeResult: + if code == "None": + return _FakeResult(success=True) + if code == "create-output": + if self.output_dir is None: + raise AssertionError("Expected output directory for create-output test.") + Path(self.output_dir, "report.txt").write_text("artifact", encoding="utf-8") + self.output_files = ["report.txt"] + return _FakeResult(success=True, stdout="done\n") + if 'call_tool("compute", a=20, b=22)' in code: + total = self._invoke_tool("compute", a=20, b=22) + return _FakeResult(success=True, stdout=f"{total}\n") + return _FakeResult(success=False, stderr="sandbox boom") + + def snapshot(self) -> str: + return "snapshot" + + def restore(self, snapshot: Any) -> None: + self.restore_calls.append(snapshot) + + def get_output_files(self) -> list[str]: + return list(self.output_files) + + +class _FakeRuntime: + def __init__(self) -> None: + self.calls: list[tuple[Any, str]] = [] + + def execute(self, *, config: Any, code: str) -> list[Content]: + self.calls.append((config, code)) + return [Content.from_text("ok")] + + +class _FakeSessionContext: + def __init__(self, *, tools: list[Any] | None = None) -> None: + self.options: dict[str, Any] = {} + if tools is not None: + self.options["tools"] = tools + self.instructions: 
list[tuple[str, str]] = [] + self.tools: list[tuple[str, list[Any]]] = [] + + def extend_instructions(self, source_id: str, instructions: str) -> None: + self.instructions.append((source_id, instructions)) + + def extend_tools(self, source_id: str, tools: list[Any]) -> None: + self.tools.append((source_id, tools)) + + +class _FakeCodeActChatClient(FunctionInvocationLayer[Any], BaseChatClient[Any]): + def __init__(self) -> None: + FunctionInvocationLayer.__init__(self) + BaseChatClient.__init__(self) + self.call_count = 0 + + def _inner_get_response( + self, + *, + messages: MutableSequence[Message], + stream: bool, + options: Mapping[str, Any], + **kwargs: Any, + ) -> Awaitable[ChatResponse] | ResponseStream[ChatResponseUpdate, ChatResponse]: + if stream: + raise AssertionError("Streaming is not used in this integration test.") + + async def _get_response() -> ChatResponse: + self.call_count += 1 + + if self.call_count == 1: + return ChatResponse( + messages=Message( + role="assistant", + contents=[ + Content.from_function_call( + call_id="execute_code_call", + name="execute_code", + arguments={ + "code": 'total = call_tool("compute", a=20, b=22)\nprint(total)', + }, + ) + ], + ) + ) + + function_results = [ + content for message in messages for content in message.contents if content.type == "function_result" + ] + assert len(function_results) == 1 + + result_content = function_results[0] + assert result_content.call_id == "execute_code_call" + + code_result = next( + item for item in result_content.items or [] if item.type == "code_interpreter_tool_result" + ) + text_output = next(item for item in code_result.outputs or [] if item.type == "text") + assert text_output.text == "42\n" + assert result_content.exception is None + + return ChatResponse(messages=Message(role="assistant", contents=["The sandbox returned 42."])) + + return _get_response() + + +def test_execute_code_tool_updates_approval_with_managed_tools() -> None: + execute_code = 
HyperlightExecuteCodeTool(tools=[compute], _registry=_FakeRuntime()) + assert execute_code.approval_mode == "never_require" + + execute_code.add_tools([dangerous_compute]) + assert execute_code.approval_mode == "always_require" + + +def test_execute_code_tool_requires_enabled_capabilities(tmp_path: Path) -> None: + execute_code = HyperlightExecuteCodeTool(_registry=_FakeRuntime()) + mount = FileMount(host_path=tmp_path, mount_path="/input/data") + + with pytest.raises(ValueError, match="filesystem_mode"): + execute_code.add_file_mounts(mount) + + with pytest.raises(ValueError, match="network_mode"): + execute_code.add_allowed_domains("api.example.com") + + +def test_execute_code_tool_description_contains_call_tool_guidance(tmp_path: Path) -> None: + workspace_root = tmp_path / "workspace" + workspace_root.mkdir() + (workspace_root / "notes.txt").write_text("hello", encoding="utf-8") + mount_file = tmp_path / "data.json" + mount_file.write_text('{"hello": "world"}', encoding="utf-8") + + execute_code = HyperlightExecuteCodeTool( + tools=[compute], + filesystem_mode="read_write", + workspace_root=workspace_root, + file_mounts=[FileMount(host_path=mount_file, mount_path="/input/data/data.json")], + network_mode="allow_list", + allowed_domains=["https://api.example.com/v1"], + allowed_http_methods=["get"], + _registry=_FakeRuntime(), + ) + + description = execute_code.description + + assert "call_tool(name, **kwargs)" in description + assert "compute" in description + assert "/input/data/data.json" in description + assert "/output" in description + assert "api.example.com" in description + assert "GET" in description + + +async def test_execute_code_tool_executes_with_structured_content(monkeypatch: pytest.MonkeyPatch) -> None: + _FakeSandbox.instances.clear() + monkeypatch.setattr(execute_code_module, "_load_sandbox_class", lambda: _FakeSandbox) + + execute_code = HyperlightExecuteCodeTool( + tools=[compute], + filesystem_mode="read_write", + 
network_mode="allow_list", + allowed_domains=["api.example.com"], + allowed_http_methods=["get"], + ) + + result = await execute_code.invoke(arguments={"code": "create-output"}) + + assert result[0].type == "code_interpreter_tool_result" + assert result[0].outputs is not None + assert result[0].outputs[0].type == "text" + assert result[0].outputs[0].text == "done\n" + assert any(item.type == "data" for item in result[0].outputs) + assert _FakeSandbox.instances[0].allowed_domains == [("api.example.com", ["GET"])] + assert "compute" in _FakeSandbox.instances[0].registered_tools + + +async def test_execute_code_tool_failure_returns_error_content(monkeypatch: pytest.MonkeyPatch) -> None: + _FakeSandbox.instances.clear() + monkeypatch.setattr(execute_code_module, "_load_sandbox_class", lambda: _FakeSandbox) + + execute_code = HyperlightExecuteCodeTool() + result = await execute_code.invoke(arguments={"code": "fail"}) + + assert result[0].type == "code_interpreter_tool_result" + assert result[0].outputs is not None + assert result[0].outputs[0].type == "error" + assert result[0].outputs[0].error_details == "sandbox boom" + + +async def test_provider_injects_run_scoped_execute_code_tool() -> None: + runtime = _FakeRuntime() + provider = HyperlightCodeActProvider(tools=[compute], _registry=runtime) + context = _FakeSessionContext(tools=[dangerous_compute]) + state: dict[str, Any] = {} + + await provider.before_run(agent=object(), session=None, context=context, state=state) + + assert context.options["tools"] == [dangerous_compute] + assert len(context.instructions) == 1 + assert len(context.tools) == 1 + + run_tool = context.tools[0][1][0] + assert isinstance(run_tool, HyperlightExecuteCodeTool) + assert run_tool.approval_mode == "never_require" + assert [tool_obj.name for tool_obj in run_tool.get_tools()] == ["compute"] + assert "dangerous_compute" not in context.instructions[0][1] + assert "compute" not in context.instructions[0][1] + assert "Filesystem capabilities:" 
not in context.instructions[0][1] + assert state[provider.source_id]["tool_names"] == ["compute"] + assert state[provider.source_id]["approval_mode"] == "never_require" + json.dumps(state) + + provider.remove_tool("compute") + assert [tool_obj.name for tool_obj in run_tool.get_tools()] == ["compute"] + + +async def test_agent_runs_hyperlight_codeact_end_to_end_with_fake_sandbox(monkeypatch: pytest.MonkeyPatch) -> None: + _FakeSandbox.instances.clear() + monkeypatch.setattr(execute_code_module, "_load_sandbox_class", lambda: _FakeSandbox) + + client = _FakeCodeActChatClient() + provider = HyperlightCodeActProvider(tools=[compute]) + agent = Agent(client=client, context_providers=[provider]) + + response = await agent.run("Use the sandbox to add 20 and 22.") + + assert response.text == "The sandbox returned 42." + assert client.call_count == 2 + assert len(_FakeSandbox.instances) == 1 + assert "compute" in _FakeSandbox.instances[0].registered_tools + + +@pytest.mark.flaky +@pytest.mark.integration +@skip_if_hyperlight_integration_tests_disabled +async def test_agent_runs_hyperlight_codeact_end_to_end_with_real_sandbox() -> None: + client = _FakeCodeActChatClient() + provider = HyperlightCodeActProvider(tools=[compute]) + agent = Agent(client=client, context_providers=[provider]) + + response = await agent.run("Use the sandbox to add 20 and 22.") + + assert response.text == "The sandbox returned 42." 
+ assert client.call_count == 2 diff --git a/python/pyproject.toml b/python/pyproject.toml index 92b37b57c7..4cd9dd37e8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -79,6 +79,7 @@ agent-framework-devui = { workspace = true } agent-framework-durabletask = { workspace = true } agent-framework-foundry = { workspace = true } agent-framework-foundry-local = { workspace = true } +agent-framework-hyperlight = { workspace = true } agent-framework-lab = { workspace = true } agent-framework-mem0 = { workspace = true } agent-framework-ollama = { workspace = true } diff --git a/python/samples/02-agents/tools/codeact_context_provider.py b/python/samples/02-agents/tools/codeact_context_provider.py deleted file mode 100644 index b12527afe7..0000000000 --- a/python/samples/02-agents/tools/codeact_context_provider.py +++ /dev/null @@ -1,601 +0,0 @@ -# /// script -# requires-python = ">=3.12,<3.13" -# dependencies = [ -# "hyperlight-sandbox", -# "hyperlight-sandbox-backend-wasm", -# "hyperlight-sandbox-python-guest", -# ] -# /// -# Run with: uv run --python 3.12 samples/02-agents/tools/codeact_context_provider.py -# -# Copyright (c) Microsoft. All rights reserved. - -from __future__ import annotations - -import asyncio -import json -import logging -import os -from collections.abc import Awaitable, Callable, Sequence -from textwrap import indent -from typing import Annotated, Any, Literal - -from agent_framework import ( - Agent, - AgentSession, - Content, - ContextProvider, - FunctionInvocationContext, - FunctionTool, - SessionContext, - function_middleware, - tool, -) -from agent_framework._tools import normalize_tools -from agent_framework.foundry import FoundryChatClient -from azure.identity import AzureCliCredential -from dotenv import load_dotenv - -try: - from hyperlight_sandbox import Sandbox -except ModuleNotFoundError as exc: - raise RuntimeError( - "This prototype expects an upstream `hyperlight_sandbox.Sandbox` " - "implementation. 
Install the provisional Hyperlight package once it " - "is available, or update this sample to match the final import path." - ) from exc - -load_dotenv() - -# ANSI color helpers for distinguishing output sources. -_CYAN = "\033[36m" -_YELLOW = "\033[33m" -_GREEN = "\033[32m" -_DIM = "\033[2m" -_RESET = "\033[0m" - - -class _ColoredFormatter(logging.Formatter): - """Dim logger output so it doesn't compete with middleware and main prints.""" - - def format(self, record: logging.LogRecord) -> str: - msg = super().format(record) - return f"{_DIM}{msg}{_RESET}" - - -logging.basicConfig(level=logging.WARNING) -logging.getLogger().handlers[0].setFormatter( - _ColoredFormatter("[%(asctime)s] %(levelname)s: %(message)s"), -) -logger = logging.getLogger(__name__) - - -"""This sample demonstrates a ContextProvider-driven Hyperlight CodeAct prototype. - -The provider owns sandbox lifecycle and the tools registered within it. -Tools are passed directly to the provider — not the agent — so the model -only sees the single ``execute_code`` tool. - -A logging function middleware is registered on the agent to show every tool -invocation (name, arguments, timing, and result) in the console output. - -Per-run tools passed to ``agent.run(..., tools=...)`` are also captured by the -provider, registered with the sandbox, and removed from the model-facing tool -list. -""" - - -def _passthrough_result_parser(result: Any) -> str: - """Return a Python repr so sandbox code sees native-looking values. - - Using ``repr`` instead of ``json.dumps`` ensures the text can be - round-tripped back to a native Python value with ``ast.literal_eval``. - """ - return repr(result) - - -def _make_sandbox_callback(tool_obj: FunctionTool) -> Callable[..., Any]: - """Wrap a tool's ``invoke`` so ``call_tool`` returns native Python values. - - ``invoke()`` always returns ``list[Content]``. 
This wrapper extracts - the text, parses it back with ``ast.literal_eval``, and returns a - single value (not a list) when there is exactly one result item. - """ - - async def _callback(**kwargs: Any) -> Any: - import ast - - contents = await tool_obj.invoke(**kwargs) - values: list[Any] = [] - for c in contents: - if c.text is not None: - try: - values.append(ast.literal_eval(c.text)) - except (ValueError, SyntaxError): - values.append(c.text) - if len(values) == 1: - return values[0] - return values - - return _callback - - -def collect_tools(*tool_groups: Any) -> list[FunctionTool]: - """Normalize and collect unique ``FunctionTool`` instances, excluding execute_code.""" - - tools: list[FunctionTool] = [] - seen_names: set[str] = set() - - for tool_group in tool_groups: - normalized_group: Sequence[Any] - if ( - isinstance(tool_group, Sequence) - and not isinstance(tool_group, (str, bytes, bytearray)) - and all(isinstance(tool_obj, FunctionTool) for tool_obj in tool_group) - ): - normalized_group = tool_group - else: - normalized_group = normalize_tools(tool_group) - - for tool_obj in normalized_group: - if not isinstance(tool_obj, FunctionTool): - continue - - name = tool_obj.name - if name == "execute_code" or name in seen_names: - continue - - seen_names.add(name) - tools.append(tool_obj) - - return tools - - -def _resolve_execute_code_approval_mode( - *, - base_approval_mode: Literal["always_require", "never_require"] | None, - tools: Sequence[FunctionTool], -) -> Literal["always_require", "never_require"]: - """Return the strictest approval mode needed for execute_code.""" - - if base_approval_mode == "always_require": - return "always_require" - - if any(tool_obj.approval_mode == "always_require" for tool_obj in tools): - return "always_require" - - return "never_require" - - -def _tool_signature(tools: Sequence[FunctionTool]) -> tuple[tuple[str, int], ...]: - """Build a stable signature for a normalized tool sequence.""" - - return tuple((tool_obj.name, 
id(tool_obj)) for tool_obj in tools) - - -def _build_codeact_instructions( - *, - tools: Sequence[FunctionTool], - tools_visible_to_model: bool, -) -> str: - """Build dynamic CodeAct instructions for the discovered tools.""" - - if tools: - tools_descriptions = "\n\n".join([ - f"- `{tool_obj.name}`\n" - f" Description: {str(tool_obj.description or '').strip() or 'No description provided.'}\n" - " Parameters:\n" - f"{indent(json.dumps(tool_obj.parameters(), indent=2, sort_keys=True), ' ')}" - for tool_obj in tools - ]) - else: - tools_descriptions = "- No tools are currently registered inside the sandbox." - - visibility_note = ( - "Some tools listed below may also appear as normal tools, but you should still prefer " - "execute_code and call them from inside the sandbox. Only if you want to run just that single tool " - "can you use it directly." - if tools_visible_to_model - else "The tools listed below are registered inside the sandbox even if they do not appear as " - "normal tools. Access them through execute_code with call_tool(...)." - ) - - return f"""You have one primary tool: execute_code. - -It runs Python in an isolated sandbox. You do NOT have direct -access to data. The ONLY way to fetch data or perform computations is by -writing Python code via execute_code that calls `call_tool()` inside the -sandbox. - -`call_tool` is a built-in global inside the sandbox. No import is needed. - -CRITICAL: call_tool takes the tool name as first argument, then KEYWORD -arguments only. Never pass a dict as a positional argument. 
- -{visibility_note} - -Available sandbox tools: -{tools_descriptions} - -Correct examples: - result = call_tool("tool_name", keyword=value) - data = call_tool("fetch_data", table="users") - x = call_tool("compute", operation="multiply", a=3, b=7) - -WRONG — these will fail: - call_tool("tool_name", {{"keyword": "value"}}) # dict as positional arg - call_tool("tool_name", "value") # positional arg - -call_tool returns native Python values (int, float, str, list, dict), -so you can use results directly in subsequent code: - data = call_tool("fetch_data", table="users") - total = call_tool("compute", operation="add", a=data[0]["price"], b=data[1]["price"]) - -Prefer one execute_code call per request when possible. -Do NOT hardcode data that should come from call_tool(...). -""" - - -class CodeActContextProvider(ContextProvider): - """Inject a CodeAct surface using provider-owned tools. - - Tools passed to the provider are registered with the sandbox and made - available to the model exclusively through ``execute_code``. They are - never added to the model-facing tool list — only ``execute_code`` is. - - Per-run tools passed to ``agent.run(..., tools=...)`` are captured from - ``context.options["tools"]``, registered with the sandbox for the - duration of the run, and removed from the model-facing run options. - """ - - DEFAULT_SOURCE_ID = "codeact_provider" - - def __init__( - self, - source_id: str = DEFAULT_SOURCE_ID, - *, - tools: Sequence[FunctionTool] | None = None, - approval_mode: Literal["always_require", "never_require"] | None = None, - ) -> None: - """Initialize the provider. - - Args: - source_id: Unique provider source identifier. - - Keyword Args: - tools: Sandbox-managed tools owned by the provider. - These are available through ``call_tool(...)`` inside - ``execute_code`` and are never surfaced to the model as - separate tools. - approval_mode: Base approval mode for the provider-managed - `execute_code` tool. 
The effective mode is upgraded to the - strictest mode required by the managed tools for each run. - Default is evaluated as `never_require`. - """ - - super().__init__(source_id) - self._provider_tools = collect_tools(tools) - for t in self._provider_tools: - t.result_parser = _passthrough_result_parser - self._approval_mode = approval_mode - self._managed_tools: list[FunctionTool] = [] - self._base_signature: tuple[tuple[str, int], ...] = () - self._runtime_signature: tuple[tuple[str, int], ...] = () - self._module_path = "python_guest.path" - self._base_sandbox: Sandbox | None = None - self._base_snapshot: Any = None - self._runtime_sandbox: Sandbox | None = None - self._runtime_snapshot: Any = None - self._sandbox: Sandbox | None = None - self._snapshot: Any = None - - self._execute_code_tool = FunctionTool( - name="execute_code", - description=( - "Python code to execute in an isolated sandbox. " - "Use call_tool(...) inside the code to access other tools." - ), - func=self._run_code, - input_model={ - "type": "object", - "properties": { - "code": { - "type": "string", - "description": ( - "Python code to execute in an isolated sandbox. " - "Use call_tool(...) inside the code to access other tools." 
- ), - } - }, - "required": ["code"], - }, - approval_mode=self._approval_mode, - ) - - @staticmethod - def _build_sandbox_and_snapshot(*, tools: Sequence[FunctionTool], module_path: str) -> tuple[Sandbox, Any]: - """Build a sandbox and clean snapshot for the given tool set.""" - sandbox = Sandbox(backend="wasm", module_path=module_path) - - for tool_obj in tools: - sandbox.register_tool(tool_obj.name, _make_sandbox_callback(tool_obj)) - - sandbox.run("None") - snapshot = sandbox.snapshot() - - logger.debug("Sandbox initialized and snapshotted.") - return sandbox, snapshot - - def _initialize_sandbox( - self, - *, - base_tools: Sequence[FunctionTool], - runtime_tools: Sequence[FunctionTool], - ) -> None: - """Initialize or reuse the appropriate base/runtime sandbox snapshot.""" - - managed_tools = collect_tools(base_tools, runtime_tools) - - base_signature = _tool_signature(base_tools) - if base_signature != self._base_signature: - self._base_signature = base_signature - self._base_sandbox = None - self._base_snapshot = None - self._runtime_signature = () - self._runtime_sandbox = None - self._runtime_snapshot = None - - if self._base_snapshot is None or self._base_sandbox is None: - self._base_sandbox, self._base_snapshot = self._build_sandbox_and_snapshot( - tools=base_tools, module_path=self._module_path - ) - - if not runtime_tools: - self._sandbox = self._base_sandbox - self._snapshot = self._base_snapshot - self._managed_tools = managed_tools - - runtime_signature = _tool_signature(runtime_tools) - if runtime_signature != self._runtime_signature: - self._runtime_signature = runtime_signature - self._runtime_sandbox = None - self._runtime_snapshot = None - - if self._runtime_snapshot is None or self._runtime_sandbox is None: - # TODO: Derive runtime snapshots from the restored base snapshot once - # the provisional Hyperlight API makes incremental tool layering practical. 
- self._runtime_sandbox, self._runtime_snapshot = self._build_sandbox_and_snapshot( - tools=managed_tools, module_path=self._module_path - ) - - self._sandbox = self._runtime_sandbox - self._snapshot = self._runtime_snapshot - self._managed_tools = managed_tools - - def _run_code(self, *, code: str) -> list[Content]: - """Restore the sandbox and execute one block of Python code.""" - - if self._sandbox is None or self._snapshot is None: - raise RuntimeError("Sandbox has not been initialized yet.") - - self._sandbox.restore(self._snapshot) - result = self._sandbox.run(code=code) - - if result.success: - logger.debug("execute_code completed.") - contents: list[Content] = [] - if result.stdout: - contents.append(Content.from_text(result.stdout.strip())) - if result.stderr: - contents.append( - Content.from_text( - f"stderr:\n{result.stderr.strip()}", - additional_properties={"stream": "stderr"}, - ) - ) - return contents or [Content.from_text("Code executed successfully without output.")] - - logger.debug("execute_code failed.") - error_details = result.stderr.strip() if result.stderr else "Unknown sandbox error" - return [ - Content.from_text(f"Execution error:\n{error_details}"), - Content.from_error(message="Execution error", error_details=error_details), - ] - - async def before_run( - self, - *, - agent: Any, - session: AgentSession | None, - context: SessionContext, - state: dict[str, Any], - ) -> None: # noqa: ARG002 - """Inject CodeAct instructions and the execute_code tool before each run.""" - - # Capture and remove per-run tools so they are only available in the sandbox. 
- runtime_tools = collect_tools(context.options.pop("tools", None)) - for t in runtime_tools: - t.result_parser = _passthrough_result_parser - self._initialize_sandbox( - base_tools=self._provider_tools, - runtime_tools=runtime_tools, - ) - self._execute_code_tool.approval_mode = _resolve_execute_code_approval_mode( - base_approval_mode=self._approval_mode, - tools=self._managed_tools, - ) - - context.extend_instructions( - self.source_id, - _build_codeact_instructions( - tools=self._managed_tools, - tools_visible_to_model=False, - ), - ) - context.extend_tools(self.source_id, [self._execute_code_tool]) - - -# 1. Define a logging function middleware to observe tool invocations. -@function_middleware -async def log_function_calls( - context: FunctionInvocationContext, - call_next: Callable[[], Awaitable[None]], -) -> None: - """Log every tool call with readable code output and timing.""" - import time - - func_name = context.function.name - args = context.arguments if isinstance(context.arguments, dict) else {} - - # For execute_code, print the generated code as a readable block. - if func_name == "execute_code" and "code" in args: - print(f"\n{_YELLOW}{'─' * 60}") - print("▶ execute_code") - print(f"{'─' * 60}{_RESET}") - print(args["code"]) - print(f"{_YELLOW}{'─' * 60}{_RESET}") - else: - print(f"\n{_YELLOW}▶ {func_name}({', '.join(f'{k}={v!r}' for k, v in args.items())}){_RESET}") - - start = time.perf_counter() - await call_next() - elapsed = time.perf_counter() - start - - # Show the result concisely — full stdout for execute_code, repr for others. 
- result = context.result - if func_name == "execute_code" and isinstance(result, list): - for item in result: - text = getattr(item, "text", None) - if text: - print(f"{_GREEN}stdout:\n{text}{_RESET}") - else: - print(f"{_YELLOW}◀ {func_name} → {result!r}{_RESET}") - - print(f"{_DIM} ({elapsed:.4f}s){_RESET}") - - -@tool(approval_mode="never_require") -def compute( - operation: Annotated[ - Literal["add", "subtract", "multiply", "divide"], "Math operation: add, subtract, multiply, or divide." - ], - a: Annotated[float, "First numeric operand."], - b: Annotated[float, "Second numeric operand."], -) -> float: - """Perform a math operation, use this function instead of raw code, because it is safer.""" - - logger.warning("compute called with operation=%r, a=%r, b=%r", operation, a, b) - - operations = { - "add": a + b, - "subtract": a - b, - "multiply": a * b, - "divide": a / b if b else float("inf"), - } - return operations.get(operation, 0.0) - - -@tool(approval_mode="never_require") -async def fetch_data( - table: Annotated[str, "Name of the simulated table to query."], -) -> list[dict[str, Any]]: - """Fetch records from a named table. - - There are two tables, with the columns shown below: - - users: id, name, role - - products: id, name, price - """ - - logger.warning("fetch_data called with table=%r", table) - - await asyncio.sleep(0.5) # Simulate some latency - - return { - "users": [ - {"id": 1, "name": "Alice", "role": "admin"}, - {"id": 2, "name": "Bob", "role": "user"}, - {"id": 3, "name": "Charlie", "role": "admin"}, - ], - "products": [ - {"id": 101, "name": "Widget", "price": 9.99}, - {"id": 102, "name": "Gadget", "price": 19.99}, - ], - }.get(table, []) - - -async def main() -> None: - """Run the provider-managed CodeAct sample.""" - - # Tools are passed to the provider (not the agent) so they are only - # available inside the sandbox via call_tool(...) and never appear as - # separate model-facing tools. 
- agent = Agent( - client=FoundryChatClient( - project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], - model=os.environ["FOUNDRY_MODEL"], - credential=AzureCliCredential(), - ), - name="CodeActProviderAgent", - instructions="You are a helpful assistant.", - context_providers=[ - CodeActContextProvider(tools=[compute, fetch_data], approval_mode="never_require"), - ], - middleware=[log_function_calls], - ) - - print(f"{_CYAN}{'=' * 60}") - print("CodeAct ContextProvider sample") - print(f"{'=' * 60}{_RESET}") - query = ( - "Fetch all users, find admins, multiply 7*(3*2), and print the users, admins, " - "and multiplication result. Use the execute_code call, and try to do as much as possible inside the sandbox with call_tool(...) instead of in raw code outside." - ) - print(f"{_CYAN}User: {query}{_RESET}") - result = await agent.run(query) - print(f"{_CYAN}Agent: {result.text}{_RESET}") - - -""" -Sample output (shape only): - -============================================================ -CodeAct ContextProvider sample -============================================================ -User: Fetch all users, find admins, multiply 6*7, ... - -──────────────────────────────────────────────────────────── -▶ execute_code -──────────────────────────────────────────────────────────── -users = call_tool("fetch_data", table="users") -admins = [u for u in users if u["role"] == "admin"] -result = call_tool("compute", operation="multiply", a=6, b=7) -print("Users:", users) -print("Admins:", admins) -print("6 * 7 =", result) -──────────────────────────────────────────────────────────── -stdout: -Users: [...] -Admins: [...] -6 * 7 = 42.0 - (0.0452s) -Agent: ... - -Notes: -- Tools are passed to `CodeActContextProvider(tools=[...])`, NOT to the agent. - This ensures they are only available inside the sandbox via `call_tool(...)`. - The model only sees the `execute_code` tool. 
-- The logging middleware prints the model-generated code as a readable block - and shows its stdout, so you can trace exactly what the agent does. -- Set `approval_mode` on `CodeActContextProvider(...)` to control the approval - behavior of the provider-managed `execute_code` tool. -- Pass tools to `agent.run(..., tools=runtime_tools)` to expose them as per-run - sandbox tools. The provider captures them from `context.options["tools"]`, - registers them with the sandbox, and removes them from the model-facing run - options. -- This sample prioritizes the intended API shape over confirmed Hyperlight - runtime integration. -""" - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/python/samples/02-agents/tools/codeact_tool.py b/python/samples/02-agents/tools/codeact_tool.py deleted file mode 100644 index 5f48a82f19..0000000000 --- a/python/samples/02-agents/tools/codeact_tool.py +++ /dev/null @@ -1,335 +0,0 @@ -# /// script -# requires-python = ">=3.12,<3.13" -# dependencies = [ -# "hyperlight-sandbox", -# "hyperlight-sandbox-backend-wasm", -# "hyperlight-sandbox-python-guest", -# ] -# /// -# Run with: uv run --python 3.12 samples/02-agents/tools/codeact_tool.py -# -# Copyright (c) Microsoft. All rights reserved. - -from __future__ import annotations - -import asyncio -import json -import logging -import os -from collections.abc import Sequence -from textwrap import indent -from typing import Annotated, Any - -from agent_framework import Agent, Content, FunctionTool, tool -from agent_framework._tools import normalize_tools -from agent_framework.foundry import FoundryChatClient -from azure.identity import AzureCliCredential -from dotenv import load_dotenv - -try: - from hyperlight_sandbox import Sandbox -except ModuleNotFoundError as exc: - raise RuntimeError( - "This prototype expects an upstream `hyperlight_sandbox.Sandbox` " - "implementation. 
Install the provisional Hyperlight package once it " - "is available, or update this sample to match the final import path." - ) from exc - -load_dotenv() - -logger = logging.getLogger(__name__) - -"""This sample demonstrates a direct-tool Hyperlight CodeAct prototype. - -The sample creates an `Agent(client=FoundryChatClient(...), ...)` with a -primary `execute_code` tool plus schema-visible tools. It also supports -per-run runtime tools by registering them with the sandbox before the run and -passing them through `agent.run(..., tools=runtime_tools)`. -""" - -DEFAULT_PROMPT = ( - "Fetch all users, find admins, multiply 6*7, and print the users, admins, " - "and multiplication result. Use one execute_code call." -) - -_SIMULATED_DATA: dict[str, list[dict[str, Any]]] = { - "users": [ - {"id": 1, "name": "Alice", "role": "admin"}, - {"id": 2, "name": "Bob", "role": "user"}, - {"id": 3, "name": "Charlie", "role": "admin"}, - ], - "products": [ - {"id": 101, "name": "Widget", "price": 9.99}, - {"id": 102, "name": "Gadget", "price": 19.99}, - ], -} - -DEFAULT_HYPERLIGHT_MODULE = "python_guest.path" - - -def collect_tools(*tool_groups: Any) -> list[FunctionTool]: - """Normalize and collect unique ``FunctionTool`` instances, excluding execute_code.""" - - tools: list[FunctionTool] = [] - seen_names: set[str] = set() - - for tool_group in tool_groups: - normalized_group: Sequence[Any] - if ( - isinstance(tool_group, Sequence) - and not isinstance(tool_group, (str, bytes, bytearray)) - and all(isinstance(tool_obj, FunctionTool) for tool_obj in tool_group) - ): - normalized_group = tool_group - else: - normalized_group = normalize_tools(tool_group) - - for tool_obj in normalized_group: - if not isinstance(tool_obj, FunctionTool): - continue - - name = tool_obj.name - if name == "execute_code" or name in seen_names: - continue - - seen_names.add(name) - tools.append(tool_obj) - - return tools - - -def build_codeact_instructions( - *, - tools: Sequence[FunctionTool], - 
tools_visible_to_model: bool, -) -> str: - """Build dynamic CodeAct instructions for the discovered tools.""" - - if tools: - callback_lines = "\n\n".join([ - f"- `{tool_obj.name}`\n" - f" Description: {str(tool_obj.description or '').strip() or 'No description provided.'}\n" - " Parameters:\n" - f"{indent(json.dumps(tool_obj.parameters(), indent=2, sort_keys=True), ' ')}" - for tool_obj in tools - ]) - else: - callback_lines = "- No tools are currently registered inside the sandbox." - - visibility_note = ( - "The tools listed below may also appear as normal tools, but you should still prefer " - "execute_code and call them from inside the sandbox." - if tools_visible_to_model - else "The tools listed below are registered inside the sandbox even if they do not appear as " - "normal tools. Access them through execute_code with call_tool(...)." - ) - - return f"""You have one primary tool: execute_code. - -It runs Python in an isolated Hyperlight Wasm sandbox. You do NOT have direct -access to data. The ONLY way to fetch data or perform computations is by -writing Python code via execute_code that calls `call_tool()` inside the -sandbox. - -`call_tool` is a built-in global inside the sandbox. No import is needed. - -{visibility_note} - -Available sandbox tools: -{callback_lines} - -Correct usage: -result = call_tool("tool_name", keyword=value) - -You can combine multiple call_tool(...) calls with regular Python code in the -same execute_code block, including loops, conditionals, variables, and -post-processing of tool results. - -Wrong usage: -call_tool("tool_name", {{"keyword": "value"}}) - -Do NOT hardcode data that should come from call_tool(...). -Prefer one execute_code call per request when possible. -Always include the complete stdout from execute_code in your final answer. 
-""" - - -def _create_wasm_sandbox(*, module_ref: str) -> Sandbox: - """Create the provisional Hyperlight Wasm sandbox instance.""" - - return Sandbox(backend="wasm", module=module_ref) - - -class CodeActSandboxManager: - """Manage the provisional Hyperlight sandbox lifecycle for this sample.""" - - def __init__(self, *, module_ref: str | None = None) -> None: - """Initialize the sandbox manager.""" - - self._module_ref = module_ref or os.environ.get("HYPERLIGHT_MODULE", DEFAULT_HYPERLIGHT_MODULE) - self._tools: list[FunctionTool] = [] - self._callback_signature: tuple[tuple[str, int], ...] = () - self._sandbox: Any = None - self._snapshot: Any = None - - def set_tools(self, tools: Sequence[FunctionTool]) -> None: - """Set the tools that should be registered with the sandbox.""" - - signature = tuple((tool_obj.name, id(tool_obj)) for tool_obj in tools) - if signature == self._callback_signature: - return - - self._tools = list(tools) - self._callback_signature = signature - self._sandbox = None - self._snapshot = None - - def initialize(self) -> None: - """Initialize the sandbox once and capture a reusable clean snapshot.""" - - if self._sandbox is not None and self._snapshot is not None: - return - - self._sandbox = _create_wasm_sandbox(module_ref=self._module_ref) - - for tool_obj in self._tools: - self._sandbox.register_tool(tool_obj.name, tool_obj.invoke) - - self._sandbox.run("None") - self._snapshot = self._sandbox.snapshot() - - logger.debug("Sandbox initialized and snapshotted.") - - def run_code(self, *, code: str) -> list[Content]: - """Restore the sandbox and execute one block of Python code.""" - - if self._sandbox is None or self._snapshot is None: - raise RuntimeError("Sandbox has not been initialized yet.") - - logger.debug("--- Model generated code ---\n%s\n--- end ---\n", code) - - self._sandbox.restore(self._snapshot) - result = self._sandbox.run(code=code) - - success = bool(getattr(result, "success", False)) - stdout = str(getattr(result, 
"stdout", "") or "").replace("\r\n", "\n") - stderr = str(getattr(result, "stderr", "") or "") - - if success: - logger.debug("execute_code completed.") - contents: list[Content] = [] - if stdout: - contents.append(Content.from_text(stdout)) - if stderr: - contents.append( - Content.from_text( - f"stderr:\n{stderr}", - additional_properties={"stream": "stderr"}, - ) - ) - return contents or [Content.from_text("Code executed successfully without output.")] - - logger.debug("execute_code failed.") - error_details = stderr or "Unknown sandbox error" - return [ - Content.from_text(f"Execution error:\n{error_details}"), - Content.from_error(message="Execution error", error_details=error_details), - ] - - -@tool(approval_mode="never_require") -def compute( - operation: Annotated[str, "Math operation: add, subtract, multiply, or divide."], - a: Annotated[float, "First numeric operand."], - b: Annotated[float, "Second numeric operand."], -) -> float: - """Perform a math operation used by sandbox code.""" - - operations = { - "add": a + b, - "subtract": a - b, - "multiply": a * b, - "divide": a / b if b else float("inf"), - } - return operations.get(operation, 0.0) - - -@tool(approval_mode="never_require") -def fetch_data( - table: Annotated[str, "Name of the simulated table to query."], -) -> list[dict[str, Any]]: - """Fetch simulated records from a named table.""" - - return _SIMULATED_DATA.get(table, []) - - -async def main() -> None: - """Run the direct-tool CodeAct sample.""" - - runtime_tools: list[Any] = [] - sandbox_manager = CodeActSandboxManager() - - @tool(name="execute_code", approval_mode="never_require") - async def execute_code( - code: Annotated[ - str, - ( - "Python code to execute in an isolated Hyperlight Wasm sandbox. " - "Use call_tool(...) inside the code to access registered host callbacks." 
- ), - ], - ) -> list[Content]: - """Execute code inside the provisional sandbox wrapper.""" - - return sandbox_manager.run_code(code=code) - - agent = Agent( - client=FoundryChatClient( - project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], - model=os.environ["FOUNDRY_MODEL"], - credential=AzureCliCredential(), - ), - name="HyperlightCodeActToolAgent", - instructions="Temporary instructions replaced before the run.", - tools=[execute_code, compute, fetch_data], - ) - - tools = collect_tools(agent.default_options.get("tools", []), runtime_tools) - sandbox_manager.set_tools(tools) - sandbox_manager.initialize() - agent.default_options["instructions"] = build_codeact_instructions( - tools=tools, - tools_visible_to_model=True, - ) - - print("=" * 60) - print("CodeAct direct tool sample") - print("=" * 60) - print(f"runtime_tool_count={len(runtime_tools)}") - print(f"User: {DEFAULT_PROMPT}") - result = await agent.run(DEFAULT_PROMPT, tools=runtime_tools) - print(f"Agent: {result.text}") - - -""" -Sample output (shape only): - -Sandbox initialized and snapshotted (...) -============================================================ -CodeAct direct tool sample -============================================================ -runtime_tool_count=0 -User: Fetch all users, find admins, multiply 6*7, and print the users, admins, -and multiplication result. Use one execute_code call. -Agent: ... - -Notes: -- Add tools to `runtime_tools` before calling `agent.run(...)` to expose them as - per-run tools and sandbox callbacks. -- This sample prioritizes the intended API shape over confirmed Hyperlight - runtime integration. 
-""" - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/python/uv.lock b/python/uv.lock index c22522d898..59e6307a19 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -44,6 +44,7 @@ members = [ "agent-framework-foundry", "agent-framework-foundry-local", "agent-framework-github-copilot", + "agent-framework-hyperlight", "agent-framework-lab", "agent-framework-mem0", "agent-framework-ollama", @@ -529,6 +530,25 @@ requires-dist = [ { name = "github-copilot-sdk", marker = "python_full_version >= '3.11'", specifier = ">=0.2.1,<=0.2.1" }, ] +[[package]] +name = "agent-framework-hyperlight" +version = "1.0.0a260409" +source = { editable = "packages/hyperlight" } +dependencies = [ + { name = "agent-framework-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "hyperlight-sandbox", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "hyperlight-sandbox-backend-wasm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "hyperlight-sandbox-python-guest", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, +] + +[package.metadata] +requires-dist = [ + { name = "agent-framework-core", editable = "packages/core" }, + { name = "hyperlight-sandbox", specifier = ">=0.3.0,<0.4" }, + { name = "hyperlight-sandbox-backend-wasm", marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = ">=0.3.0,<0.4" }, + { name = "hyperlight-sandbox-python-guest", specifier = ">=0.3.0,<0.4" }, +] + [[package]] name = "agent-framework-lab" version = "1.0.0b260409" @@ -2677,6 +2697,39 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload-time = "2025-01-22T21:41:47.295Z" }, ] 
+[[package]] +name = "hyperlight-sandbox" +version = "0.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cf/fe/ce88996ea3e3e05130d6f0e8cd2ffbe9ab9bf3d9448b7050d4b8d0802b0a/hyperlight_sandbox-0.3.0.tar.gz", hash = "sha256:00491ce267ffbdb206377c79b4afd86510177ad73f4daf2ef7fce02b54eaf801", size = 9251, upload-time = "2026-04-07T03:49:52.542Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2b/33/e6dcd6729308d13570ae2d3be0e476019a6f3fea387d7549bb1f77ce0408/hyperlight_sandbox-0.3.0-py3-none-any.whl", hash = "sha256:ba8e6779d64e9c187acd93456851ebafaed2f49380e5d132bc0906a4080d2217", size = 5723, upload-time = "2026-04-07T03:49:53.276Z" }, +] + +[[package]] +name = "hyperlight-sandbox-backend-wasm" +version = "0.3.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/91/c9d68cad7996fdd2f1facef1453156bdd8d52eefa976cc8c827c13029497/hyperlight_sandbox_backend_wasm-0.3.0-cp310-cp310-manylinux_2_34_x86_64.whl", hash = "sha256:eda362f5f737b0823326290d7627c76ce0547a78e70f07f8c9d177e34622fc02", size = 3806454, upload-time = "2026-04-07T03:49:24.238Z" }, + { url = "https://files.pythonhosted.org/packages/9a/6f/6b2399a1caf59dd19b635d99ee1add0c975af7bc3317f5d0f1f9c3f90aa0/hyperlight_sandbox_backend_wasm-0.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:79347b7ae94f2786691b04cb52130dabc5991e0c03b42a24bad8adc766832655", size = 3283951, upload-time = "2026-04-07T03:49:17.137Z" }, + { url = "https://files.pythonhosted.org/packages/23/f2/b380c34a0ce8d486a05adb66757f98cca029e1fb1c96b1c29be0d25d3882/hyperlight_sandbox_backend_wasm-0.3.0-cp311-cp311-manylinux_2_34_x86_64.whl", hash = "sha256:aff9eec4803fb535a140298e2632529f4150fcf3c6ea3ff2ae4571572a836116", size = 3806601, upload-time = "2026-04-07T03:49:22.853Z" }, + { url = 
"https://files.pythonhosted.org/packages/b4/5a/fb78cfd934e0523887b8d5b073b7b2aed3b545add21cda3aa95929ac1659/hyperlight_sandbox_backend_wasm-0.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:b6151704dd19862c9869b115752b4504b45d0b2eeb46aa9385a1a3b8be11cfa8", size = 3284164, upload-time = "2026-04-07T03:49:18.556Z" }, + { url = "https://files.pythonhosted.org/packages/21/bc/4e21f5c7ccd9307ac63a61c71b62a57ee4a9e6eec77fc72ff072907a21f5/hyperlight_sandbox_backend_wasm-0.3.0-cp312-cp312-manylinux_2_34_x86_64.whl", hash = "sha256:cfd1d22ce221774d82a5174d268d56ff70fc1a23fb993a6491358b5d0ed169bf", size = 3802901, upload-time = "2026-04-07T03:49:19.845Z" }, + { url = "https://files.pythonhosted.org/packages/9a/41/646be9b0c7bb0f9192e45a77414673aa414eb316c92b5312efe6fb4ce802/hyperlight_sandbox_backend_wasm-0.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:229ab494a422f2de895a2a27ad6a6a2daed710ea062d7c213878bbe5f5b32fa7", size = 3281220, upload-time = "2026-04-07T03:49:21.368Z" }, + { url = "https://files.pythonhosted.org/packages/74/3a/f8ec4a41fffba4036dfc3cbddc3dfb6e87466b01afe1cb0a50cc6a0f0eed/hyperlight_sandbox_backend_wasm-0.3.0-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:b91905ee2ddd36a78b0dd13b1a62be99a995a45121587c111692591e40b36912", size = 3802789, upload-time = "2026-04-07T03:49:15.614Z" }, + { url = "https://files.pythonhosted.org/packages/3c/62/dfa8c15102f9b8ec5c3b5ffb54b99d60c75e7a6e4d00540757656bc5a5d8/hyperlight_sandbox_backend_wasm-0.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:eff682761c3b86abfe7e0d523ea0e6d5c7e8299302917c53918743b82c9d1ea2", size = 3280501, upload-time = "2026-04-07T03:49:13.939Z" }, +] + +[[package]] +name = "hyperlight-sandbox-python-guest" +version = "0.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/23/6a/f182c4315d31a98dd3b82f9274638e3adb399779584af93c5087bb2f814f/hyperlight_sandbox_python_guest-0.3.0.tar.gz", hash = 
"sha256:b1de5d8e87375dc6bef744ecd7ae2a7f43d5f6b913b4e990e9872bd439c0b19e", size = 21554625, upload-time = "2026-04-07T03:49:42.672Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/8e/4cd754928464f56528645c7421ccbb3fcbe45ad2542f899712b0f2f2c0e1/hyperlight_sandbox_python_guest-0.3.0-py3-none-any.whl", hash = "sha256:3c55a7420666ad9a208893dbdf7ad1b5c8ad4f3a94b1a56e64979719c7ce95c1", size = 21716481, upload-time = "2026-04-07T03:49:39.885Z" }, +] + [[package]] name = "idna" version = "3.11" From bc61298d2f0c7b6147ead9e4d6b4c7deb31d454a Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 9 Apr 2026 16:13:26 +0200 Subject: [PATCH 07/17] Python: Limit Hyperlight wasm backend to Python <3.14 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- python/packages/hyperlight/pyproject.toml | 3 +-- .../hyperlight/tests/hyperlight/test_hyperlight_codeact.py | 5 +++++ python/uv.lock | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/packages/hyperlight/pyproject.toml b/python/packages/hyperlight/pyproject.toml index 1e0a75abc7..9884152043 100644 --- a/python/packages/hyperlight/pyproject.toml +++ b/python/packages/hyperlight/pyproject.toml @@ -19,13 +19,12 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", - "Programming Language :: Python :: 3.14", "Typing :: Typed", ] dependencies = [ "agent-framework-core>=1.0.0,<2", "hyperlight-sandbox>=0.3.0,<0.4", - "hyperlight-sandbox-backend-wasm>=0.3.0,<0.4 ; sys_platform == 'linux' or sys_platform == 'win32'", + "hyperlight-sandbox-backend-wasm>=0.3.0,<0.4 ; (sys_platform == 'linux' or sys_platform == 'win32') and python_version < '3.14'", "hyperlight-sandbox-python-guest>=0.3.0,<0.4", ] diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py index 335121150d..0267c779df 
100644 --- a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -37,6 +37,11 @@ def _hyperlight_integration_skip_reason() -> str | None: if enabled not in {"1", "true", "yes"}: return "Set RUN_HYPERLIGHT_INTEGRATION_TESTS=true to enable Hyperlight integration tests." + if sys.version_info >= (3, 14): + return ( + "Hyperlight integration tests require Python < 3.14 because hyperlight-sandbox-backend-wasm is unsupported." + ) + if sys.platform not in {"linux", "win32"}: return "Hyperlight integration tests require Linux or Windows runners." diff --git a/python/uv.lock b/python/uv.lock index 59e6307a19..5028347ed2 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -537,7 +537,7 @@ source = { editable = "packages/hyperlight" } dependencies = [ { name = "agent-framework-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "hyperlight-sandbox", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, - { name = "hyperlight-sandbox-backend-wasm", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "hyperlight-sandbox-backend-wasm", marker = "(python_full_version < '3.14' and sys_platform == 'linux') or (python_full_version < '3.14' and sys_platform == 'win32')" }, { name = "hyperlight-sandbox-python-guest", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] @@ -545,7 +545,7 @@ dependencies = [ requires-dist = [ { name = "agent-framework-core", editable = "packages/core" }, { name = "hyperlight-sandbox", specifier = ">=0.3.0,<0.4" }, - { name = "hyperlight-sandbox-backend-wasm", marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = ">=0.3.0,<0.4" }, + { name = "hyperlight-sandbox-backend-wasm", marker = "(python_full_version < '3.14' and sys_platform == 'linux') or (python_full_version < '3.14' 
and sys_platform == 'win32')", specifier = ">=0.3.0,<0.4" }, { name = "hyperlight-sandbox-python-guest", specifier = ">=0.3.0,<0.4" }, ] From 9f8e5f47f6f38186622c2244b6ceb26dd482298e Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 9 Apr 2026 16:19:37 +0200 Subject: [PATCH 08/17] Python: Fix CI for Hyperlight CodeAct PR Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .worktrees/devui_datastar | 1 - .worktrees/issue-4675-duplicate-telemetry | 1 - .worktrees/issue-4676-a2a-sdk-update | 1 - python/packages/core/tests/core/test_agents.py | 2 -- 4 files changed, 5 deletions(-) delete mode 160000 .worktrees/devui_datastar delete mode 160000 .worktrees/issue-4675-duplicate-telemetry delete mode 160000 .worktrees/issue-4676-a2a-sdk-update diff --git a/.worktrees/devui_datastar b/.worktrees/devui_datastar deleted file mode 160000 index bf8d9672e1..0000000000 --- a/.worktrees/devui_datastar +++ /dev/null @@ -1 +0,0 @@ -Subproject commit bf8d9672e147c42696a5a17b0ed37878196b6715 diff --git a/.worktrees/issue-4675-duplicate-telemetry b/.worktrees/issue-4675-duplicate-telemetry deleted file mode 160000 index 55cc6e85c0..0000000000 --- a/.worktrees/issue-4675-duplicate-telemetry +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 55cc6e85c08db4d7795a48e85261655efd895409 diff --git a/.worktrees/issue-4676-a2a-sdk-update b/.worktrees/issue-4676-a2a-sdk-update deleted file mode 160000 index c551983295..0000000000 --- a/.worktrees/issue-4676-a2a-sdk-update +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c5519832953763b847b7cacc515edb78cf50d28d diff --git a/python/packages/core/tests/core/test_agents.py b/python/packages/core/tests/core/test_agents.py index 1358a997a5..41cc6e4663 100644 --- a/python/packages/core/tests/core/test_agents.py +++ b/python/packages/core/tests/core/test_agents.py @@ -865,7 +865,6 @@ async def before_run(self, *, agent: Any, session: Any, context: Any, state: Any options=None, compaction_strategy=None, tokenizer=None, - 
legacy_kwargs={}, function_invocation_kwargs=None, client_kwargs=None, ) @@ -895,7 +894,6 @@ async def before_run(self, *, agent: Any, session: Any, context: Any, state: Any options=None, compaction_strategy=None, tokenizer=None, - legacy_kwargs={}, function_invocation_kwargs=None, client_kwargs=None, ) From 2e0b8b7d7408ae1917def36b471b69fbc64cca63 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 9 Apr 2026 16:28:49 +0200 Subject: [PATCH 09/17] Python: Run Hyperlight integration when available Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../hyperlight/tests/hyperlight/test_hyperlight_codeact.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py index 0267c779df..bff3ec5831 100644 --- a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -7,7 +7,6 @@ import importlib.util import inspect import json -import os import sys import threading from collections.abc import Awaitable, Callable, Mapping, MutableSequence @@ -33,10 +32,6 @@ def _hyperlight_integration_skip_reason() -> str | None: - enabled = os.getenv("RUN_HYPERLIGHT_INTEGRATION_TESTS", "").strip().lower() - if enabled not in {"1", "true", "yes"}: - return "Set RUN_HYPERLIGHT_INTEGRATION_TESTS=true to enable Hyperlight integration tests." - if sys.version_info >= (3, 14): return ( "Hyperlight integration tests require Python < 3.14 because hyperlight-sandbox-backend-wasm is unsupported." 
From 48fc7750bc5c9d2f2a71166dba9eee06cfc847ea Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 9 Apr 2026 16:47:14 +0200 Subject: [PATCH 10/17] Python: Address Hyperlight review feedback Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/decisions/0024-codeact-integration.md | 2 +- docs/features/code_act/python-implementation.md | 16 ++++++++-------- .../_execute_code_tool.py | 11 +++++------ .../agent_framework_hyperlight/py.typed | 1 + .../tests/hyperlight/test_hyperlight_codeact.py | 16 ++++++++++++++++ 5 files changed, 31 insertions(+), 15 deletions(-) create mode 100644 python/packages/hyperlight/agent_framework_hyperlight/py.typed diff --git a/docs/decisions/0024-codeact-integration.md b/docs/decisions/0024-codeact-integration.md index 309b092db3..aa835051d6 100644 --- a/docs/decisions/0024-codeact-integration.md +++ b/docs/decisions/0024-codeact-integration.md @@ -11,7 +11,7 @@ informed: ## Context and Problem Statement -We need a architecture design that supports CodeAct in both Python and .NET. This is a necessary capability for the current generation of long running agents, which need to plan, iterate, transform tool outputs, and execute bounded code inside a controlled runtime instead of pushing every intermediate step back through the model. The design should preserve the same behavioral contract across SDKs, but it does not need to use the same internal extension point in each runtime. We also want to standardize on Hyperlight as the initial backend, using the existing Python package and an anticipated .NET binding package once it is available. +We need an architecture design that supports CodeAct in both Python and .NET. This is a necessary capability for the current generation of long-running agents, which need to plan, iterate, transform tool outputs, and execute bounded code inside a controlled runtime instead of pushing every intermediate step back through the model. 
The design should preserve the same behavioral contract across SDKs, but it does not need to use the same internal extension point in each runtime. We also want to standardize on Hyperlight as the initial backend, using the existing Python package and an anticipated .NET binding package once it is available. Throughout this ADR, **CodeAct** is the primary term. **Code mode** and **programmatic tool calling** refer to the same capability. This ADR uses **CodeAct** consistently. diff --git a/docs/features/code_act/python-implementation.md b/docs/features/code_act/python-implementation.md index 195d31a1b8..cfd257b92f 100644 --- a/docs/features/code_act/python-implementation.md +++ b/docs/features/code_act/python-implementation.md @@ -60,41 +60,41 @@ There is no separate runtime setup object in the Python design. CodeAct tools, f Preferred pattern: - `add_tools(...) -> None` - `get_tools() -> Sequence[ToolTypes]` -- `remove_tools(...) -> None` +- `remove_tool(...) -> None` - `clear_tools() -> None` - `add_file_mounts(...) -> None` - `get_file_mounts() -> Sequence[FileMount]` -- `remove_file_mounts(...) -> None` +- `remove_file_mount(...) -> None` - `clear_file_mounts() -> None` - `add_allowed_domains(...) -> None` - `get_allowed_domains() -> Sequence[str]` -- `remove_allowed_domains(...) -> None` +- `remove_allowed_domain(...) -> None` - `clear_allowed_domains() -> None` - `add_allowed_http_methods(...) -> None` - `get_allowed_http_methods() -> Sequence[str]` -- `remove_allowed_http_methods(...) -> None` +- `remove_allowed_http_method(...) -> None` - `clear_allowed_http_methods() -> None` Requirements: - The provider-owned CodeAct tool registry is keyed by tool name. - `add_tools(...)` adds new tools and replaces an existing provider-owned registration when the same tool name is added again. - `get_tools()` returns the provider's current configured CodeAct tool registry. -- `remove_tools(...)` removes provider-owned CodeAct tools by name. 
+- `remove_tool(...)` removes provider-owned CodeAct tools by name. - `clear_tools()` removes all provider-owned CodeAct tools. - File mounts are keyed by sandbox mount path. - `add_file_mounts(...)` adds new file mounts and replaces an existing mount when the same mount path is added again. - `get_file_mounts()` returns the provider's current configured file mounts. -- `remove_file_mounts(...)` removes file mounts by mount path. +- `remove_file_mount(...)` removes file mounts by mount path. - `clear_file_mounts()` removes all configured file mounts. - Allowed domains are keyed by normalized domain string. - `add_allowed_domains(...)` adds domains to the outbound allow list. - `get_allowed_domains()` returns the current outbound domain allow list. -- `remove_allowed_domains(...)` removes domains from the outbound allow list. +- `remove_allowed_domain(...)` removes domains from the outbound allow list. - `clear_allowed_domains()` removes all configured allowed domains. - Allowed HTTP methods are keyed by normalized method name. - `add_allowed_http_methods(...)` adds methods to the outbound method allow list. - `get_allowed_http_methods()` returns the current outbound method allow list. -- `remove_allowed_http_methods(...)` removes methods from the outbound method allow list. +- `remove_allowed_http_method(...)` removes methods from the outbound method allow list. - `clear_allowed_http_methods()` removes all configured allowed HTTP methods. - Tool, file-mount, and network-allow-list mutations affect subsequent runs only; runs already in progress keep the snapshot captured at run start. - The provider must snapshot its effective tool registry and capability state at the start of each run so concurrent execution remains deterministic. 
diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py index 0ed9e695c7..361b0797dd 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py @@ -109,20 +109,19 @@ def _passthrough_result_parser(result: Any) -> str: def _collect_tools(*tool_groups: Any) -> list[FunctionTool]: - tools: list[FunctionTool] = [] - seen_names: set[str] = set() + tools_by_name: dict[str, FunctionTool] = {} for tool_group in tool_groups: normalized_group = normalize_tools(tool_group) for tool_obj in normalized_group: if not isinstance(tool_obj, FunctionTool): continue - if tool_obj.name == "execute_code" or tool_obj.name in seen_names: + if tool_obj.name == "execute_code": continue - seen_names.add(tool_obj.name) - tools.append(tool_obj) + tools_by_name.pop(tool_obj.name, None) + tools_by_name[tool_obj.name] = tool_obj - return tools + return list(tools_by_name.values()) def _resolve_execute_code_approval_mode( diff --git a/python/packages/hyperlight/agent_framework_hyperlight/py.typed b/python/packages/hyperlight/agent_framework_hyperlight/py.typed new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/python/packages/hyperlight/agent_framework_hyperlight/py.typed @@ -0,0 +1 @@ + diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py index bff3ec5831..fb27eb7651 100644 --- a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -70,6 +70,11 @@ def dangerous_compute(a: int, b: int) -> int: return a * b +@tool(name="compute", approval_mode="always_require") +def replacement_compute(a: int, b: int) -> int: + return a - b + + @dataclass(slots=True) class 
_FakeResult: success: bool @@ -250,6 +255,17 @@ def test_execute_code_tool_updates_approval_with_managed_tools() -> None: assert execute_code.approval_mode == "always_require" +def test_execute_code_tool_replaces_tools_with_the_same_name() -> None: + execute_code = HyperlightExecuteCodeTool(tools=[compute], _registry=_FakeRuntime()) + + execute_code.add_tools(replacement_compute) + + tools = execute_code.get_tools() + assert len(tools) == 1 + assert tools[0] is replacement_compute + assert execute_code.approval_mode == "always_require" + + def test_execute_code_tool_requires_enabled_capabilities(tmp_path: Path) -> None: execute_code = HyperlightExecuteCodeTool(_registry=_FakeRuntime()) mount = FileMount(host_path=tmp_path, mount_path="/input/data") From 4b564cf6e835b84021afe46aaadf3192c40d81aa Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 9 Apr 2026 16:53:36 +0200 Subject: [PATCH 11/17] Python: Simplify Hyperlight file mount inputs Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../code_act/python-implementation.md | 16 ++++-- python/packages/hyperlight/README.md | 4 ++ .../agent_framework_hyperlight/__init__.py | 3 +- .../_execute_code_tool.py | 52 ++++++++++++++----- .../agent_framework_hyperlight/_provider.py | 6 +-- .../agent_framework_hyperlight/_types.py | 12 ++--- .../hyperlight/test_hyperlight_codeact.py | 24 ++++++++- 7 files changed, 86 insertions(+), 31 deletions(-) diff --git a/docs/features/code_act/python-implementation.md b/docs/features/code_act/python-implementation.md index cfd257b92f..c84efcc8a8 100644 --- a/docs/features/code_act/python-implementation.md +++ b/docs/features/code_act/python-implementation.md @@ -144,11 +144,12 @@ Caching rules: #### Core types ```python -@dataclass(frozen=True) -class FileMount: - host_path: str | Path +class FileMount(NamedTuple): + host_path: str mount_path: str +FileMountInput = str | tuple[str, str] | FileMount + class HyperlightCodeActProvider(ContextProvider): def 
__init__( @@ -162,7 +163,7 @@ class HyperlightCodeActProvider(ContextProvider): approval_mode: Literal["always_require", "never_require"] = "never_require", filesystem_mode: Literal["none", "read_only", "read_write"] = "none", workspace_root: Path | None = None, - file_mounts: Sequence[FileMount] = (), + file_mounts: Sequence[FileMountInput] = (), network_mode: Literal["none", "allow_list"] = "none", allowed_domains: Sequence[str] = (), allowed_http_methods: Sequence[str] = (), @@ -172,7 +173,7 @@ class HyperlightCodeActProvider(ContextProvider): def get_tools(self) -> Sequence[ToolTypes]: ... def remove_tool(self, name: str) -> None: ... def clear_tools(self) -> None: ... - def add_file_mounts(self, mounts: FileMount | Sequence[FileMount]) -> None: ... + def add_file_mounts(self, mounts: FileMountInput | Sequence[FileMountInput]) -> None: ... def get_file_mounts(self) -> Sequence[FileMount]: ... def remove_file_mount(self, mount_path: str) -> None: ... def clear_file_mounts(self) -> None: ... @@ -186,6 +187,11 @@ class HyperlightCodeActProvider(ContextProvider): def clear_allowed_http_methods(self) -> None: ... ``` +`file_mounts` accepts three equivalent input forms: +- `"data/report.csv"` uses the same relative path on the host and in the sandbox. +- `("fixtures/users.json", "data/users.json")` uses distinct host and sandbox paths. +- `FileMount("fixtures/users.json", "data/users.json")` is the named-tuple form of the explicit pair. + No public abstract `CodeActContextProvider` base or public `executor=` parameter is required for the initial Python API. The initial alpha package also exports a standalone `HyperlightExecuteCodeTool` diff --git a/python/packages/hyperlight/README.md b/python/packages/hyperlight/README.md index d4181808be..096b2ad2c4 100644 --- a/python/packages/hyperlight/README.md +++ b/python/packages/hyperlight/README.md @@ -18,9 +18,13 @@ create the sandbox. 
- `HyperlightCodeActProvider` - `HyperlightExecuteCodeTool` - `FileMount` +- `FileMountInput` ## Notes - This package is intentionally separate from `agent-framework-core` so CodeAct usage and installation remain optional. - Alpha-package samples live under `packages/hyperlight/samples/`. +- `file_mounts` accepts a single string shorthand, an explicit `(host_path, + mount_path)` pair, or a `FileMount` named tuple. Use the explicit two-value + form when the host path differs from the sandbox path. diff --git a/python/packages/hyperlight/agent_framework_hyperlight/__init__.py b/python/packages/hyperlight/agent_framework_hyperlight/__init__.py index 8bdc5a1467..2be9c2f7cb 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/__init__.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/__init__.py @@ -6,7 +6,7 @@ from ._execute_code_tool import HyperlightExecuteCodeTool from ._provider import HyperlightCodeActProvider -from ._types import FileMount, FilesystemMode, NetworkMode +from ._types import FileMount, FileMountInput, FilesystemMode, NetworkMode try: __version__ = importlib.metadata.version(__name__) @@ -15,6 +15,7 @@ __all__ = [ "FileMount", + "FileMountInput", "FilesystemMode", "HyperlightCodeActProvider", "HyperlightExecuteCodeTool", diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py index 361b0797dd..a8efb338f6 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py @@ -11,7 +11,7 @@ from dataclasses import dataclass from pathlib import Path, PurePosixPath from tempfile import TemporaryDirectory -from typing import Annotated, Any, Protocol +from typing import Annotated, Any, Protocol, TypeGuard, cast from urllib.parse import urlparse from agent_framework import Content, FunctionTool @@ -19,7 +19,7 @@ from 
pydantic import BaseModel, Field from ._instructions import build_codeact_instructions, build_execute_code_description -from ._types import FileMount, FilesystemMode, NetworkMode +from ._types import FileMount, FileMountInput, FilesystemMode, NetworkMode DEFAULT_HYPERLIGHT_BACKEND = "wasm" DEFAULT_HYPERLIGHT_MODULE = "python_guest.path" @@ -152,6 +152,30 @@ def _resolve_workspace_root(value: str | Path | None) -> Path | None: return resolved_path +def _is_file_mount_pair(value: Any) -> TypeGuard[FileMount | tuple[str, str]]: + if not isinstance(value, tuple): + return False + + value_tuple = cast(tuple[Any, ...], value) + if len(value_tuple) != 2: + return False + + host_path, mount_path = value_tuple + return isinstance(host_path, str) and isinstance(mount_path, str) + + +def _normalize_file_mount_input(file_mount: FileMountInput) -> _StoredFileMount: + if isinstance(file_mount, str): + host_path, mount_path = file_mount, file_mount + else: + host_path, mount_path = file_mount + + return _StoredFileMount( + host_path=_resolve_existing_path(host_path), + mount_path=_normalize_mount_path(mount_path), + ) + + def _normalize_domain(target: str) -> str: candidate = target.strip() if not candidate: @@ -394,7 +418,7 @@ def __init__( approval_mode: ApprovalMode | None = None, filesystem_mode: FilesystemMode = "none", workspace_root: str | Path | None = None, - file_mounts: FileMount | Sequence[FileMount] | None = None, + file_mounts: FileMountInput | Sequence[FileMountInput] | None = None, network_mode: NetworkMode = "none", allowed_domains: str | Sequence[str] | None = None, allowed_http_methods: str | Sequence[str] | None = None, @@ -488,19 +512,19 @@ def clear_tools(self) -> None: self._managed_tools = [] self._refresh_approval_mode() - def add_file_mounts(self, file_mounts: FileMount | Sequence[FileMount]) -> None: - """Add one or more file mounts under `/input`.""" + def add_file_mounts(self, file_mounts: FileMountInput | Sequence[FileMountInput]) -> None: + """Add 
one or more file mounts under `/input`. + + A single string uses the same relative path on the host and in the sandbox. + Use a two-string tuple or `FileMount` when those paths differ. + """ if self._filesystem_mode == "none": raise ValueError("File mounts require filesystem_mode to be 'read_only' or 'read_write'.") - mounts = [file_mounts] if isinstance(file_mounts, FileMount) else list(file_mounts) - normalized_mounts = [ - _StoredFileMount( - host_path=_resolve_existing_path(mount.host_path), - mount_path=_normalize_mount_path(mount.mount_path), - ) - for mount in mounts - ] + mounts = ( + [file_mounts] if isinstance(file_mounts, str) or _is_file_mount_pair(file_mounts) else list(file_mounts) + ) + normalized_mounts = [_normalize_file_mount_input(mount) for mount in mounts] with self._state_lock: for mount in normalized_mounts: @@ -510,7 +534,7 @@ def get_file_mounts(self) -> list[FileMount]: """Return the configured file mounts.""" with self._state_lock: return [ - FileMount(host_path=mount.host_path, mount_path=_display_mount_path(mount.mount_path)) + FileMount(host_path=str(mount.host_path), mount_path=_display_mount_path(mount.mount_path)) for mount in self._file_mounts.values() ] diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_provider.py b/python/packages/hyperlight/agent_framework_hyperlight/_provider.py index e6820dc440..55e0974d93 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_provider.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_provider.py @@ -10,7 +10,7 @@ from agent_framework._tools import ApprovalMode from ._execute_code_tool import HyperlightExecuteCodeTool, SandboxRuntime -from ._types import FileMount, FilesystemMode, NetworkMode +from ._types import FileMount, FileMountInput, FilesystemMode, NetworkMode class HyperlightCodeActProvider(ContextProvider): @@ -26,7 +26,7 @@ def __init__( approval_mode: ApprovalMode | None = None, filesystem_mode: FilesystemMode = "none", workspace_root: 
str | Path | None = None, - file_mounts: FileMount | Sequence[FileMount] | None = None, + file_mounts: FileMountInput | Sequence[FileMountInput] | None = None, network_mode: NetworkMode = "none", allowed_domains: str | Sequence[str] | None = None, allowed_http_methods: str | Sequence[str] | None = None, @@ -70,7 +70,7 @@ def clear_tools(self) -> None: """Remove all provider-owned sandbox tools.""" self._execute_code_tool.clear_tools() - def add_file_mounts(self, file_mounts: FileMount | Sequence[FileMount]) -> None: + def add_file_mounts(self, file_mounts: FileMountInput | Sequence[FileMountInput]) -> None: """Add provider-managed file mounts.""" self._execute_code_tool.add_file_mounts(file_mounts) diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_types.py b/python/packages/hyperlight/agent_framework_hyperlight/_types.py index 91c363130e..be25eb0c24 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_types.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_types.py @@ -2,17 +2,17 @@ from __future__ import annotations -from dataclasses import dataclass -from pathlib import Path -from typing import Literal +from typing import Literal, NamedTuple, TypeAlias FilesystemMode = Literal["none", "read_only", "read_write"] NetworkMode = Literal["none", "allow_list"] -@dataclass(frozen=True, slots=True) -class FileMount: +class FileMount(NamedTuple): """Map a host file or directory into the sandbox input tree.""" - host_path: str | Path + host_path: str mount_path: str + + +FileMountInput: TypeAlias = str | tuple[str, str] | FileMount diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py index fb27eb7651..2eb1fb96ef 100644 --- a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -266,9 +266,29 @@ def 
test_execute_code_tool_replaces_tools_with_the_same_name() -> None: assert execute_code.approval_mode == "always_require" +def test_execute_code_tool_accepts_string_and_tuple_file_mounts( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + shorthand_file = tmp_path / "notes.txt" + shorthand_file.write_text("hello", encoding="utf-8") + explicit_file = tmp_path / "data.json" + explicit_file.write_text('{"hello": "world"}', encoding="utf-8") + monkeypatch.chdir(tmp_path) + + execute_code = HyperlightExecuteCodeTool(filesystem_mode="read_only", _registry=_FakeRuntime()) + execute_code.add_file_mounts("notes.txt") + execute_code.add_file_mounts((str(explicit_file), "data/data.json")) + + assert execute_code.get_file_mounts() == [ + FileMount(str(shorthand_file.resolve()), "/input/notes.txt"), + FileMount(str(explicit_file.resolve()), "/input/data/data.json"), + ] + + def test_execute_code_tool_requires_enabled_capabilities(tmp_path: Path) -> None: execute_code = HyperlightExecuteCodeTool(_registry=_FakeRuntime()) - mount = FileMount(host_path=tmp_path, mount_path="/input/data") + mount = (str(tmp_path), "data") with pytest.raises(ValueError, match="filesystem_mode"): execute_code.add_file_mounts(mount) @@ -288,7 +308,7 @@ def test_execute_code_tool_description_contains_call_tool_guidance(tmp_path: Pat tools=[compute], filesystem_mode="read_write", workspace_root=workspace_root, - file_mounts=[FileMount(host_path=mount_file, mount_path="/input/data/data.json")], + file_mounts=[FileMount(str(mount_file), "data/data.json")], network_mode="allow_list", allowed_domains=["https://api.example.com/v1"], allowed_http_methods=["get"], From e8eb62abafa3f7d00c0237148c75af0903d616b2 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 9 Apr 2026 16:59:02 +0200 Subject: [PATCH 12/17] Python: Accept Path host paths in Hyperlight mounts Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../features/code_act/python-implementation.md | 8 
++++---- python/packages/hyperlight/README.md | 5 +++-- .../_execute_code_tool.py | 18 ++++++++++-------- .../agent_framework_hyperlight/_types.py | 6 ++++-- .../hyperlight/test_hyperlight_codeact.py | 6 +++--- 5 files changed, 24 insertions(+), 19 deletions(-) diff --git a/docs/features/code_act/python-implementation.md b/docs/features/code_act/python-implementation.md index c84efcc8a8..8740d22358 100644 --- a/docs/features/code_act/python-implementation.md +++ b/docs/features/code_act/python-implementation.md @@ -145,10 +145,10 @@ Caching rules: ```python class FileMount(NamedTuple): - host_path: str + host_path: str | Path mount_path: str -FileMountInput = str | tuple[str, str] | FileMount +FileMountInput = str | tuple[str | Path, str] | FileMount class HyperlightCodeActProvider(ContextProvider): @@ -189,8 +189,8 @@ class HyperlightCodeActProvider(ContextProvider): `file_mounts` accepts three equivalent input forms: - `"data/report.csv"` uses the same relative path on the host and in the sandbox. -- `("fixtures/users.json", "data/users.json")` uses distinct host and sandbox paths. -- `FileMount("fixtures/users.json", "data/users.json")` is the named-tuple form of the explicit pair. +- `("fixtures/users.json", "data/users.json")` or `(Path("fixtures/users.json"), "data/users.json")` uses distinct host and sandbox paths. +- `FileMount(Path("fixtures/users.json"), "data/users.json")` is the named-tuple form of the explicit pair. No public abstract `CodeActContextProvider` base or public `executor=` parameter is required for the initial Python API. diff --git a/python/packages/hyperlight/README.md b/python/packages/hyperlight/README.md index 096b2ad2c4..396075259c 100644 --- a/python/packages/hyperlight/README.md +++ b/python/packages/hyperlight/README.md @@ -26,5 +26,6 @@ create the sandbox. usage and installation remain optional. - Alpha-package samples live under `packages/hyperlight/samples/`. 
- `file_mounts` accepts a single string shorthand, an explicit `(host_path, - mount_path)` pair, or a `FileMount` named tuple. Use the explicit two-value - form when the host path differs from the sandbox path. + mount_path)` pair, or a `FileMount` named tuple. The host-side path in the + explicit forms may be a `str` or `Path`. Use the explicit two-value form when + the host path differs from the sandbox path. diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py index a8efb338f6..6a5113fee4 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py @@ -19,7 +19,7 @@ from pydantic import BaseModel, Field from ._instructions import build_codeact_instructions, build_execute_code_description -from ._types import FileMount, FileMountInput, FilesystemMode, NetworkMode +from ._types import FileMount, FileMountHostPath, FileMountInput, FilesystemMode, NetworkMode DEFAULT_HYPERLIGHT_BACKEND = "wasm" DEFAULT_HYPERLIGHT_MODULE = "python_guest.path" @@ -152,7 +152,7 @@ def _resolve_workspace_root(value: str | Path | None) -> Path | None: return resolved_path -def _is_file_mount_pair(value: Any) -> TypeGuard[FileMount | tuple[str, str]]: +def _is_file_mount_pair(value: Any) -> TypeGuard[FileMount | tuple[FileMountHostPath, str]]: if not isinstance(value, tuple): return False @@ -161,7 +161,7 @@ def _is_file_mount_pair(value: Any) -> TypeGuard[FileMount | tuple[str, str]]: return False host_path, mount_path = value_tuple - return isinstance(host_path, str) and isinstance(mount_path, str) + return isinstance(host_path, (str, Path)) and isinstance(mount_path, str) def _normalize_file_mount_input(file_mount: FileMountInput) -> _StoredFileMount: @@ -521,10 +521,12 @@ def add_file_mounts(self, file_mounts: FileMountInput | Sequence[FileMountInput] if 
self._filesystem_mode == "none": raise ValueError("File mounts require filesystem_mode to be 'read_only' or 'read_write'.") - mounts = ( - [file_mounts] if isinstance(file_mounts, str) or _is_file_mount_pair(file_mounts) else list(file_mounts) - ) - normalized_mounts = [_normalize_file_mount_input(mount) for mount in mounts] + if isinstance(file_mounts, str) or _is_file_mount_pair(file_mounts): + normalized_mounts = [_normalize_file_mount_input(file_mounts)] + else: + normalized_mounts = [ + _normalize_file_mount_input(mount) for mount in cast(Sequence[FileMountInput], file_mounts) + ] with self._state_lock: for mount in normalized_mounts: @@ -534,7 +536,7 @@ def get_file_mounts(self) -> list[FileMount]: """Return the configured file mounts.""" with self._state_lock: return [ - FileMount(host_path=str(mount.host_path), mount_path=_display_mount_path(mount.mount_path)) + FileMount(host_path=mount.host_path, mount_path=_display_mount_path(mount.mount_path)) for mount in self._file_mounts.values() ] diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_types.py b/python/packages/hyperlight/agent_framework_hyperlight/_types.py index be25eb0c24..84081f9f8f 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_types.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_types.py @@ -2,6 +2,7 @@ from __future__ import annotations +from pathlib import Path from typing import Literal, NamedTuple, TypeAlias FilesystemMode = Literal["none", "read_only", "read_write"] @@ -11,8 +12,9 @@ class FileMount(NamedTuple): """Map a host file or directory into the sandbox input tree.""" - host_path: str + host_path: str | Path mount_path: str -FileMountInput: TypeAlias = str | tuple[str, str] | FileMount +FileMountHostPath: TypeAlias = str | Path +FileMountInput: TypeAlias = str | tuple[FileMountHostPath, str] | FileMount diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py 
b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py index 2eb1fb96ef..f35d4f99c6 100644 --- a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -278,11 +278,11 @@ def test_execute_code_tool_accepts_string_and_tuple_file_mounts( execute_code = HyperlightExecuteCodeTool(filesystem_mode="read_only", _registry=_FakeRuntime()) execute_code.add_file_mounts("notes.txt") - execute_code.add_file_mounts((str(explicit_file), "data/data.json")) + execute_code.add_file_mounts((explicit_file, "data/data.json")) assert execute_code.get_file_mounts() == [ - FileMount(str(shorthand_file.resolve()), "/input/notes.txt"), - FileMount(str(explicit_file.resolve()), "/input/data/data.json"), + FileMount(shorthand_file.resolve(), "/input/notes.txt"), + FileMount(explicit_file.resolve(), "/input/data/data.json"), ] From 4c6f7da2e0c3b1b168ab22b13da3bcd43f41cac4 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Thu, 9 Apr 2026 17:12:07 +0200 Subject: [PATCH 13/17] Python: Fix Hyperlight mount typing for CI Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../agent_framework_hyperlight/_execute_code_tool.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py index 6a5113fee4..d39a56a8bb 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py @@ -156,7 +156,7 @@ def _is_file_mount_pair(value: Any) -> TypeGuard[FileMount | tuple[FileMountHost if not isinstance(value, tuple): return False - value_tuple = cast(tuple[Any, ...], value) + value_tuple = cast(tuple[object, ...], value) if len(value_tuple) != 2: return False @@ -165,10 +165,14 @@ def 
_is_file_mount_pair(value: Any) -> TypeGuard[FileMount | tuple[FileMountHost def _normalize_file_mount_input(file_mount: FileMountInput) -> _StoredFileMount: + host_path: FileMountHostPath + mount_path: str if isinstance(file_mount, str): - host_path, mount_path = file_mount, file_mount + host_path = file_mount + mount_path = file_mount else: - host_path, mount_path = file_mount + host_path = file_mount[0] + mount_path = file_mount[1] return _StoredFileMount( host_path=_resolve_existing_path(host_path), From f89d65a8929496a004d618c51e9e7d83927a1639 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 10 Apr 2026 08:51:55 +0200 Subject: [PATCH 14/17] temp run integration test --- .../hyperlight/tests/hyperlight/test_hyperlight_codeact.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py index f35d4f99c6..ab18d81d89 100644 --- a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -404,8 +404,7 @@ async def test_agent_runs_hyperlight_codeact_end_to_end_with_fake_sandbox(monkey assert "compute" in _FakeSandbox.instances[0].registered_tools -@pytest.mark.flaky -@pytest.mark.integration +# @pytest.mark.integration @skip_if_hyperlight_integration_tests_disabled async def test_agent_runs_hyperlight_codeact_end_to_end_with_real_sandbox() -> None: client = _FakeCodeActChatClient() From ef6e1b4dfe225c7a90205d342c16d1147e617273 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 10 Apr 2026 09:03:13 +0200 Subject: [PATCH 15/17] Python: Strengthen Hyperlight real sandbox tests Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../hyperlight/test_hyperlight_codeact.py | 127 ++++++++++++++++-- 1 file changed, 118 insertions(+), 9 deletions(-) diff --git 
a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py index ab18d81d89..d35e715a33 100644 --- a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -9,8 +9,10 @@ import json import sys import threading -from collections.abc import Awaitable, Callable, Mapping, MutableSequence +from collections.abc import Awaitable, Callable, Iterator, Mapping, MutableSequence +from contextlib import contextmanager from dataclasses import dataclass +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from pathlib import Path from typing import Any @@ -191,6 +193,68 @@ def extend_tools(self, source_id: str, tools: list[Any]) -> None: self.tools.append((source_id, tools)) +def _extract_execute_code_result(function_result: Content) -> Content: + assert function_result.type == "function_result" + assert function_result.exception is None, ( + f"execute_code raised {function_result.exception!r} with items={function_result.items!r}" + ) + + code_result = next( + (item for item in function_result.items or [] if item.type == "code_interpreter_tool_result"), + None, + ) + if code_result is not None: + return code_result + + text_outputs = [item for item in function_result.items or [] if item.type == "text"] + if text_outputs: + return Content.from_code_interpreter_tool_result(outputs=text_outputs) + + if function_result.result: + return Content.from_code_interpreter_tool_result(outputs=[Content.from_text(function_result.result)]) + + raise AssertionError(f"execute_code returned no usable outputs: {function_result.items!r}") + + +def _extract_text_output(result_content: Content) -> str: + code_result = _extract_execute_code_result(result_content) + text_output = next( + (item for item in code_result.outputs or [] if item.type == "text" and item.text is not None), None + ) + assert 
text_output is not None and text_output.text is not None, ( + f"Expected text output from execute_code, got {code_result.outputs!r}" + ) + return text_output.text + + +@contextmanager +def _serve_http_text_response(body: bytes) -> Iterator[tuple[str, list[str]]]: + requests: list[str] = [] + + class _Handler(BaseHTTPRequestHandler): + def do_GET(self) -> None: # noqa: N802 + requests.append(self.path) + self.send_response(200) + self.send_header("Content-Type", "text/plain; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format: str, *args: Any) -> None: + return + + server = ThreadingHTTPServer(("127.0.0.1", 0), _Handler) + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + + try: + yield f"127.0.0.1:{server.server_port}", requests + finally: + server.shutdown() + server.server_close() + thread.join() + + class _FakeCodeActChatClient(FunctionInvocationLayer[Any], BaseChatClient[Any]): def __init__(self) -> None: FunctionInvocationLayer.__init__(self) @@ -234,13 +298,7 @@ async def _get_response() -> ChatResponse: result_content = function_results[0] assert result_content.call_id == "execute_code_call" - - code_result = next( - item for item in result_content.items or [] if item.type == "code_interpreter_tool_result" - ) - text_output = next(item for item in code_result.outputs or [] if item.type == "text") - assert text_output.text == "42\n" - assert result_content.exception is None + assert _extract_text_output(result_content) == "42\n" return ChatResponse(messages=Message(role="assistant", contents=["The sandbox returned 42."])) @@ -404,7 +462,7 @@ async def test_agent_runs_hyperlight_codeact_end_to_end_with_fake_sandbox(monkey assert "compute" in _FakeSandbox.instances[0].registered_tools -# @pytest.mark.integration +@pytest.mark.integration @skip_if_hyperlight_integration_tests_disabled async def 
test_agent_runs_hyperlight_codeact_end_to_end_with_real_sandbox() -> None: client = _FakeCodeActChatClient() @@ -415,3 +473,54 @@ async def test_agent_runs_hyperlight_codeact_end_to_end_with_real_sandbox() -> N assert response.text == "The sandbox returned 42." assert client.call_count == 2 + + +@pytest.mark.integration +@skip_if_hyperlight_integration_tests_disabled +async def test_provider_run_tool_reads_writes_files_and_accesses_allowed_url_with_real_sandbox( + tmp_path: Path, +) -> None: + mounted_file = tmp_path / "mounted.txt" + mounted_file.write_text("hello from mount", encoding="utf-8") + + with _serve_http_text_response(b"network ok") as (allowed_host, requests): + provider = HyperlightCodeActProvider( + filesystem_mode="read_write", + network_mode="allow_list", + ) + provider.add_file_mounts((mounted_file, "data/input.txt")) + provider.add_allowed_domains(allowed_host) + provider.add_allowed_http_methods("GET") + + context = _FakeSessionContext() + state: dict[str, Any] = {} + await provider.before_run(agent=object(), session=None, context=context, state=state) + + run_tool = context.tools[0][1][0] + assert isinstance(run_tool, HyperlightExecuteCodeTool) + + result = await run_tool.invoke( + arguments={ + "code": ( + "from pathlib import Path\n" + "from urllib.request import urlopen\n\n" + 'input_text = Path("/input/data/input.txt").read_text(encoding="utf-8")\n' + 'Path("/output/result.txt").write_text(input_text.upper(), encoding="utf-8")\n' + f'with urlopen("http://{allowed_host}/allowed", timeout=10) as response:\n' + ' network_text = response.read().decode("utf-8")\n' + "print(input_text)\n" + "print(network_text)\n" + ) + } + ) + + assert result[0].type == "code_interpreter_tool_result" + outputs = result[0].outputs or [] + + text_output = next(item for item in outputs if item.type == "text" and item.text is not None) + assert text_output.text == "hello from mount\nnetwork ok\n" + + file_output = next(item for item in outputs if item.type == 
"data") + assert file_output.data == b"HELLO FROM MOUNT" + assert file_output.additional_properties["path"] == "/output/result.txt" + assert requests == ["/allowed"] From 1867b4d32938d8319255269a19c90641cd46aef3 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 10 Apr 2026 09:36:38 +0200 Subject: [PATCH 16/17] added additional tests --- .../hyperlight/tests/hyperlight/test_hyperlight_codeact.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py index d35e715a33..ac69dea65b 100644 --- a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -462,7 +462,6 @@ async def test_agent_runs_hyperlight_codeact_end_to_end_with_fake_sandbox(monkey assert "compute" in _FakeSandbox.instances[0].registered_tools -@pytest.mark.integration @skip_if_hyperlight_integration_tests_disabled async def test_agent_runs_hyperlight_codeact_end_to_end_with_real_sandbox() -> None: client = _FakeCodeActChatClient() @@ -475,7 +474,6 @@ async def test_agent_runs_hyperlight_codeact_end_to_end_with_real_sandbox() -> N assert client.call_count == 2 -@pytest.mark.integration @skip_if_hyperlight_integration_tests_disabled async def test_provider_run_tool_reads_writes_files_and_accesses_allowed_url_with_real_sandbox( tmp_path: Path, From 764c15a9403b83759eed348fcc11d0f753a3a5d9 Mon Sep 17 00:00:00 2001 From: eavanvalkenburg Date: Fri, 10 Apr 2026 10:29:52 +0200 Subject: [PATCH 17/17] Python: Simplify Hyperlight CodeAct API Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/decisions/0024-codeact-integration.md | 28 ++- .../code_act/python-implementation.md | 94 +++++---- python/packages/hyperlight/README.md | 5 + .../agent_framework_hyperlight/__init__.py | 6 +- .../_execute_code_tool.py | 196 +++++++++--------- .../_instructions.py 
| 53 ++--- .../agent_framework_hyperlight/_provider.py | 38 +--- .../agent_framework_hyperlight/_types.py | 16 +- .../hyperlight/test_hyperlight_codeact.py | 46 ++-- 9 files changed, 231 insertions(+), 251 deletions(-) diff --git a/docs/decisions/0024-codeact-integration.md b/docs/decisions/0024-codeact-integration.md index aa835051d6..fda4d3cb7d 100644 --- a/docs/decisions/0024-codeact-integration.md +++ b/docs/decisions/0024-codeact-integration.md @@ -15,6 +15,8 @@ We need an architecture design that supports CodeAct in both Python and .NET. Th Throughout this ADR, **CodeAct** is the primary term. **Code mode** and **programmatic tool calling** refer to the same capability. This ADR uses **CodeAct** consistently. +Model-generated code is treated as untrusted relative to the host process. This ADR assumes the selected backend provides the primary isolation boundary, while the framework is responsible for configuring approvals and capabilities, integrating telemetry, and translating outputs and failures into framework-native shapes. If a backend cannot provide isolation appropriate for its trust model, it is not a suitable CodeAct backend. + The core design question is: **where should CodeAct integrate into the agent pipeline so that both SDKs can offer the same functionality without invasive changes to their core function-calling loops?** ## Decision Drivers @@ -22,6 +24,7 @@ The core design question is: **where should CodeAct integrate into the agent pip - CodeAct must shape the model-facing surface before model invocation, not only after the model has already chosen tools. - The design should let users control which tools are available through CodeAct and which remain regular tools only. - The design must preserve existing session, approval, telemetry, and tool invocation behavior as much as possible. 
+- The design should define the minimum cross-SDK telemetry and failure semantics for `execute_code`, so Python and .NET do not diverge on basic observability or error handling. - The design must fit naturally into the extension points that already exist in each SDK. - The design must be safe for concurrent runs and must not rely on mutating shared agent configuration during invocation. - The chosen structure should allow multiple backend-specific providers to fit under the same conceptual design over time, even though Hyperlight is the initial target. @@ -145,16 +148,20 @@ We standardize the **public concept** of CodeAct across SDKs while allowing each - Python uses a `ContextProvider`. - .NET uses an `AIContextProvider`. - The term **CodeAct context provider** is used throughout this ADR as a design concept, not as a required public base type. Public SDK APIs should prefer concrete backend-specific types such as `HyperlightCodeActProvider` rather than a public abstract `CodeActContextProvider` or a public `CodeActExecutor` parameter. -- CodeAct support should ship as an optional package in each SDK rather than as part of the core package, so users who do not need CodeAct do not take on its installation and dependency footprint. +- CodeAct support should ship as an optional package in each SDK rather than as part of the core package, so users who do not need CodeAct do not take on its installation and dependency footprint. That optional package may still depend on a few small, backward-compatible hooks in the host SDK's core agent pipeline. - There is no separate runtime setup object in the chosen design. Concrete providers manage their provider-owned CodeAct tool registry, file mounts, and outbound network allow-list configuration directly through CRUD-style methods on the provider itself. 
- At a high level, CodeAct is exposed through backend-specific context providers that contribute an `execute_code` tool, own the CodeAct-specific tool registry, and carry backend capability configuration such as filesystem and network access. - The initial approval model is bundled approval for `execute_code`, using the same `approval_mode="always_require" | "never_require"` vocabulary as regular tools. -- The CodeAct provider exposes a default `approval_mode` for `execute_code`. If the provider default is `never_require`, the effective approval for `execute_code` is derived from the provider-owned CodeAct tool registry captured for the run. +- The CodeAct provider exposes a default `approval_mode` for `execute_code`. If the provider default is `always_require`, `execute_code` is always treated as `always_require` regardless of the provider-owned tool registry. If the provider default is `never_require`, the effective approval for `execute_code` is derived from the provider-owned CodeAct tool registry captured for the run. - If every provider-owned CodeAct tool in that registry has `approval_mode="never_require"`, `execute_code` is treated as `never_require`. If any provider-owned CodeAct tool in that registry has `approval_mode="always_require"`, `execute_code` is treated as `always_require`, even if the generated code may not end up calling that tool. - Approval is granted before `execute_code` starts, and provider-owned tool calls made from inside that execution run under the same approval. - Direct-only agent tools do not affect the approval of `execute_code`; only the provider-owned CodeAct tool registry participates in that calculation. +- This approval model is intentionally conservative. If one sensitive provider-owned tool forces `execute_code` to require approval more often than desired, the mitigation is to keep that tool direct-only or split it into a different provider/tool surface rather than trying to infer per-run tool usage up front. 
- Configuring filesystem and network capability state on the provider, including adding file mounts or outbound network allow-list entries, is itself the approval for those capabilities in the initial model. -- Each `execute_code` invocation must start from a clean execution state. Exact caching, snapshot, and environment-reuse strategies are implementation details defined in the language-specific specs. +- Each `execute_code` invocation must start from a clean execution state; in-memory variables and other ephemeral interpreter/runtime state must not persist across separate calls. When a provider exposes a workspace, mounted files, or a writable artifact/output area, those files are the supported persistence mechanism across calls and are treated as external state rather than interpreter state. +- Mutating the provider's tool registry or capability configuration while a run is in flight is allowed, but it only affects subsequent runs. Provider implementations must snapshot the effective state for each run and synchronize concurrent access so shared provider instances remain safe across concurrent runs. +- The minimum cross-SDK telemetry contract is that `execute_code` is traced as a normal tool invocation nested inside the surrounding agent run, and provider-owned tool calls made from inside CodeAct continue to emit ordinary tool-invocation telemetry. Backend-specific resource metrics are optional extensions, not a required new top-level cross-SDK event model. +- Timeout, out-of-memory, backend crash, and similar sandbox failures are all execution failures of `execute_code` and should surface as structured error results rather than backend-specific public DTOs. Partial textual or file outputs may be returned only when the backend can report them unambiguously; callers must not rely on partial-output recovery as a portable guarantee. 
- The provider-based structure preserves room for future pre-execution inspection and nested per-tool approvals if later experience shows they are needed. - Concrete backend-specific providers may still use small SDK-local helpers or adapters internally, but that split is an implementation detail rather than a public API requirement. @@ -163,18 +170,25 @@ Detailed language-specific implementation notes are specified in: - [Python implementation](../features/code_act/python-implementation.md) - [.NET implementation](../features/code_act/dotnet-implementation.md) +### Minimal core hooks required by the optional package + +CodeAct remains optional at the package level, but the optional package depends on a small number of hooks that must live in the host SDK because the agent pipeline owns model invocation and per-run tool resolution. + +- Python depends on the existing `ContextProvider` lifecycle, `SessionContext.extend_instructions(...)`, `SessionContext.extend_tools(...)`, per-run runtime tool access via `SessionContext.options["tools"]`, and the shared `ApprovalMode` vocabulary used by `FunctionTool`. +- .NET depends on the existing `AIContextProvider` seam, agent/runtime support for applying providers before model invocation, and the existing chat-client or function-invocation seams that concrete implementations use to contribute `execute_code`. + +These hooks are backward-compatible because they only expose or forward per-run state that core already owns. Behavior changes only when a concrete CodeAct provider opts in and uses them. + ### Concrete provider implementation contract The design does not require a public abstract `CodeActContextProvider` base class, but it does require a stable implementation contract for concrete providers. 
- Concrete providers should expose a standard capability surface at construction time, with SDK-appropriate naming for: - approval mode - - filesystem mode - workspace root - file mounts - - network mode - - allowed outbound domains - - allowed HTTP methods or an equivalent outbound policy surface + - allowed outbound targets plus any per-target method or policy restrictions needed by the backend +- Separate public `filesystem_mode` / `network_mode` flags are not required by the cross-SDK contract. Filesystem access may be disabled implicitly until a workspace or file mounts are configured, and outbound network may be disabled implicitly until an allow-list or equivalent outbound policy entry is configured. - Concrete providers should expose direct CRUD-style methods for managing the provider-owned CodeAct tool registry, file mounts, and outbound network allow-list configuration, rather than requiring callers to construct a separate runtime setup object. - Concrete providers should implement their host SDK's provider lifecycle hooks to: - build CodeAct instructions, diff --git a/docs/features/code_act/python-implementation.md b/docs/features/code_act/python-implementation.md index 8740d22358..d5a4a3b018 100644 --- a/docs/features/code_act/python-implementation.md +++ b/docs/features/code_act/python-implementation.md @@ -13,7 +13,7 @@ Goals: - Developers can configure a provider-owned CodeAct tool set that is separate from the agent's direct `tools=` surface. - Developers can use the same `execute_code` concept for both tool-enabled CodeAct and a standard code interpreter tool implementation. - Developers can swap execution backends over time, starting with Hyperlight while keeping room for alternatives such as Pydantic's Monty. -- Developers can configure execution capabilities such as file access, workspace mounts, and outbound network allow lists in a portable way. 
+- Developers can configure execution capabilities such as workspace mounts and outbound network allow lists in a portable way. Success Metric: - Python samples exist for both a tool-enabled CodeAct mode and a standard interpreter mode. @@ -26,7 +26,7 @@ Implementation-free outcome: - Today, the easiest way to prototype CodeAct is to infer or reshape the agent's direct tool surface, which is fragile and hard to reason about. - In Python, inferring a CodeAct tool surface from generic agent tool configuration is fragile and hard to reason about. - There is no first-class Python design that simultaneously covers Hyperlight-backed CodeAct now, future backend-specific providers such as Monty, and both tool-enabled and interpreter modes. -- Sandbox capabilities such as file access and network access need a portable configuration model instead of ad hoc backend-specific wiring. +- Sandbox capabilities such as mounted file access and outbound network access need a portable configuration model instead of ad hoc backend-specific wiring. - Approval behavior needs to be explicit and configurable, especially when CodeAct and direct tool calling may both be available. ## API Changes @@ -67,13 +67,9 @@ Preferred pattern: - `remove_file_mount(...) -> None` - `clear_file_mounts() -> None` - `add_allowed_domains(...) -> None` -- `get_allowed_domains() -> Sequence[str]` +- `get_allowed_domains() -> Sequence[AllowedDomain]` - `remove_allowed_domain(...) -> None` - `clear_allowed_domains() -> None` -- `add_allowed_http_methods(...) -> None` -- `get_allowed_http_methods() -> Sequence[str]` -- `remove_allowed_http_method(...) -> None` -- `clear_allowed_http_methods() -> None` Requirements: - The provider-owned CodeAct tool registry is keyed by tool name. @@ -86,16 +82,11 @@ Requirements: - `get_file_mounts()` returns the provider's current configured file mounts. - `remove_file_mount(...)` removes file mounts by mount path. - `clear_file_mounts()` removes all configured file mounts. 
-- Allowed domains are keyed by normalized domain string. -- `add_allowed_domains(...)` adds domains to the outbound allow list. -- `get_allowed_domains()` returns the current outbound domain allow list. -- `remove_allowed_domain(...)` removes domains from the outbound allow list. -- `clear_allowed_domains()` removes all configured allowed domains. -- Allowed HTTP methods are keyed by normalized method name. -- `add_allowed_http_methods(...)` adds methods to the outbound method allow list. -- `get_allowed_http_methods()` returns the current outbound method allow list. -- `remove_allowed_http_method(...)` removes methods from the outbound method allow list. -- `clear_allowed_http_methods()` removes all configured allowed HTTP methods. +- Allowed domains are keyed by normalized target string. +- `add_allowed_domains(...)` adds allow-list entries and replaces an existing entry when the same target is added again. +- `get_allowed_domains()` returns the current outbound allow-list entries. +- `remove_allowed_domain(...)` removes allow-list entries by target. +- `clear_allowed_domains()` removes all configured allow-list entries. - Tool, file-mount, and network-allow-list mutations affect subsequent runs only; runs already in progress keep the snapshot captured at run start. - The provider must snapshot its effective tool registry and capability state at the start of each run so concurrent execution remains deterministic. @@ -120,6 +111,8 @@ Effective `execute_code` approval is computed as follows: This is intentionally conservative and matches the shape of the current function-tool approval flow, where `FunctionTool` uses `always_require` / `never_require` and the auto-invocation loop escalates the whole batch if any called tool requires approval. +If one sensitive provider-owned tool causes `execute_code` to require approval more often than desired, the mitigation is to keep that tool direct-only or expose it through a different CodeAct provider/tool surface. 
The initial model does not try to infer whether generated code will actually call that tool before approval. + If the framework later standardizes pre-execution inspection or nested per-tool approvals, the Python provider surface can grow to expose that explicitly. The initial design does not assume that those extra modes are required. #### Shared execution flow @@ -138,6 +131,8 @@ Caching rules: - Backends that support snapshots may cache a reusable clean snapshot. - Backends that do not support snapshots may still cache warm initialization artifacts. - No mutable per-run execution state may be shared across concurrent runs. +- In-memory interpreter state does not persist across separate `execute_code` calls. +- Configured workspace files, mounted files, and any writable artifact/output area are the supported persistence mechanism across calls when the backend exposes them. ### Python public API @@ -151,6 +146,14 @@ class FileMount(NamedTuple): FileMountInput = str | tuple[str | Path, str] | FileMount +class AllowedDomain(NamedTuple): + target: str + methods: tuple[str, ...] | None = None + + +AllowedDomainInput = str | tuple[str, str | Sequence[str]] | AllowedDomain + + class HyperlightCodeActProvider(ContextProvider): def __init__( self, @@ -161,12 +164,9 @@ class HyperlightCodeActProvider(ContextProvider): module_path: str | None = None, tools: ToolTypes | None = None, approval_mode: Literal["always_require", "never_require"] = "never_require", - filesystem_mode: Literal["none", "read_only", "read_write"] = "none", workspace_root: Path | None = None, file_mounts: Sequence[FileMountInput] = (), - network_mode: Literal["none", "allow_list"] = "none", - allowed_domains: Sequence[str] = (), - allowed_http_methods: Sequence[str] = (), + allowed_domains: Sequence[AllowedDomainInput] = (), ) -> None: ... def add_tools(self, tools: ToolTypes | Sequence[ToolTypes]) -> None: ... 
@@ -177,14 +177,10 @@ class HyperlightCodeActProvider(ContextProvider): def get_file_mounts(self) -> Sequence[FileMount]: ... def remove_file_mount(self, mount_path: str) -> None: ... def clear_file_mounts(self) -> None: ... - def add_allowed_domains(self, domains: str | Sequence[str]) -> None: ... - def get_allowed_domains(self) -> Sequence[str]: ... + def add_allowed_domains(self, domains: AllowedDomainInput | Sequence[AllowedDomainInput]) -> None: ... + def get_allowed_domains(self) -> Sequence[AllowedDomain]: ... def remove_allowed_domain(self, domain: str) -> None: ... def clear_allowed_domains(self) -> None: ... - def add_allowed_http_methods(self, methods: str | Sequence[str]) -> None: ... - def get_allowed_http_methods(self) -> Sequence[str]: ... - def remove_allowed_http_method(self, method: str) -> None: ... - def clear_allowed_http_methods(self) -> None: ... ``` `file_mounts` accepts three equivalent input forms: @@ -192,6 +188,11 @@ class HyperlightCodeActProvider(ContextProvider): - `("fixtures/users.json", "data/users.json")` or `(Path("fixtures/users.json"), "data/users.json")` uses distinct host and sandbox paths. - `FileMount(Path("fixtures/users.json"), "data/users.json")` is the named-tuple form of the explicit pair. +`allowed_domains` accepts three input forms: +- `"github.com"` allows that target with all backend-supported methods. +- `("github.com", "GET")` or `("github.com", ["GET", "HEAD"])` uses an explicit per-target method list. +- `AllowedDomain("github.com", ("GET", "HEAD"))` is the named-tuple form of the explicit entry. + No public abstract `CodeActContextProvider` base or public `executor=` parameter is required for the initial Python API. The initial alpha package also exports a standalone `HyperlightExecuteCodeTool` @@ -208,6 +209,13 @@ Provider modes: The concrete provider plugs into the existing Python `ContextProvider` surface from `agent_framework._sessions`. 
+The Hyperlight package also depends on a small set of core hooks that must remain available from `agent-framework-core`: +- `ContextProvider.before_run(...)` +- `SessionContext.extend_instructions(...)` +- `SessionContext.extend_tools(...)` +- per-run runtime tool access via `SessionContext.options["tools"]` +- the shared `ApprovalMode` vocabulary used by `FunctionTool` + Required lifecycle hook: - `before_run(*, agent, session, context, state) -> None` @@ -223,6 +231,8 @@ Optional lifecycle hook: If the provider stores anything in `state`, that value must stay JSON-serializable. +Mutating the provider after `before_run(...)` has captured a run-scoped snapshot is allowed, but it affects subsequent runs only. Provider implementations should synchronize state capture and CRUD operations so shared provider instances remain safe across concurrent runs. + `after_run(...)` is responsible for any backend-specific cleanup or post-processing that must happen after the model invocation completes. If shared internal helpers are introduced later for multiple concrete providers, they should standardize responsibilities for: @@ -244,6 +254,7 @@ If shared internal helpers are introduced later for multiple concrete providers, - The provider does not inspect or mutate `Agent.default_options["tools"]` or `context.options["tools"]` to determine its CodeAct tool set. - The provider snapshots the current CodeAct tool registry and capability state at run start, so later registry and allow-list mutations only affect future runs. - Interpreter versus tool-enabled behavior is derived from the concrete provider and the presence of CodeAct-managed tools, not from a separate public profile object. +- `execute_code` should be traced like a normal tool invocation within the surrounding agent run, and provider-owned tool calls executed through `call_tool(...)` should continue to emit ordinary tool invocation telemetry. 
#### Backend integration @@ -254,7 +265,7 @@ Backend-specific notes: - **Hyperlight** - Provider construction needs a guest artifact via `module`, which may be a packaged guest module name or a path to a compiled guest artifact. - File access maps naturally to Hyperlight Sandbox's read-only `/input` and writable `/output` capability model. - - Network access is denied by default and is enabled through allow-listed domains plus HTTP verbs. + - Network access is denied by default and is enabled through per-target allow-list entries. - **Monty** - A future `MontyCodeActProvider` should be a separate public type rather than a `HyperlightCodeActProvider` mode. - Monty does not expose built-in filesystem or network access directly inside the interpreter. @@ -263,27 +274,23 @@ Backend-specific notes: #### Capability handling -Capabilities are first-class `HyperlightCodeActProvider` init parameters and, for collection-shaped state, provider-managed CRUD surfaces: -- `filesystem_mode` +Capabilities are first-class `HyperlightCodeActProvider` init parameters and provider-managed CRUD surfaces: - `workspace_root` - `file_mounts` -- `network_mode` - `allowed_domains` -- `allowed_http_methods` Concrete providers should normalize these settings internally. Hyperlight can map them directly to sandbox capabilities, while Monty must enforce them through host-mediated file and network functions and may apply stricter URL-level checks than the public provider surface expresses. Expected management split: -- scalar policy settings such as `filesystem_mode`, `workspace_root`, and `network_mode` remain direct configuration values on the provider, +- `workspace_root` remains a direct configuration value on the provider, - file mounts are managed through provider CRUD methods, -- outbound domains are managed through provider CRUD methods, -- outbound HTTP methods are managed through provider CRUD methods. +- outbound allow-list entries are managed through provider CRUD methods. 
Enabling access means: -- `filesystem_mode="none"` disables file access from sandboxed code. -- `filesystem_mode="read_only"` or `"read_write"` enables file access within the mounted/workspace surface exposed by the provider. -- `network_mode="none"` disables outbound network access. -- `network_mode="allow_list"` enables outbound access only for the configured `allowed_domains` and `allowed_http_methods`. +- Configuring `workspace_root` or any `file_mounts` enables the sandbox filesystem surface exposed through `/input` and `/output`. +- Leaving both `workspace_root` and `file_mounts` unset means no filesystem surface is configured. +- Adding any `allowed_domains` entry enables outbound access only for the configured targets; leaving it empty means network access is disabled without a separate `network_mode` flag. +- A string target allows all backend-supported methods for that target; an explicit tuple or `AllowedDomain` entry narrows the methods for that target. Backends may implement stricter semantics than these top-level settings. For example, Hyperlight naturally maps file access to `/input` and `/output`, while Monty would enforce equivalent policy through host-provided callbacks rather than direct interpreter I/O. @@ -315,6 +322,8 @@ Use the existing content model from `agent_framework._types`, for example: Execution failures should surface readable error text and structured error `Content`, not a custom backend result object. +Timeouts, out-of-memory conditions, backend crashes, and similar sandbox failures are all `execute_code` failures and should surface as structured error content. Partial textual or file outputs may be returned only when the backend can report them unambiguously; callers should not rely on partial-output recovery as a portable contract. 
+ ## E2E Code Samples ### Tool-enabled CodeAct mode @@ -322,11 +331,8 @@ Execution failures should surface readable error text and structured error `Cont ```python codeact = HyperlightCodeActProvider( tools=[fetch_docs, query_data], - filesystem_mode="read_write", workspace_root="./workdir", - network_mode="allow_list", - allowed_domains=["api.github.com"], - allowed_http_methods=["GET"], + allowed_domains=[("api.github.com", "GET")], ) codeact.add_tools([lookup_user]) @@ -342,9 +348,7 @@ agent = Agent( ```python code_interpreter = HyperlightCodeActProvider( - filesystem_mode="read_only", workspace_root="./data", - network_mode="none", ) agent = Agent( diff --git a/python/packages/hyperlight/README.md b/python/packages/hyperlight/README.md index 396075259c..cfe2a02022 100644 --- a/python/packages/hyperlight/README.md +++ b/python/packages/hyperlight/README.md @@ -15,6 +15,8 @@ create the sandbox. ## Public API +- `AllowedDomain` +- `AllowedDomainInput` - `HyperlightCodeActProvider` - `HyperlightExecuteCodeTool` - `FileMount` @@ -29,3 +31,6 @@ create the sandbox. mount_path)` pair, or a `FileMount` named tuple. The host-side path in the explicit forms may be a `str` or `Path`. Use the explicit two-value form when the host path differs from the sandbox path. +- `allowed_domains` accepts a single string target such as `"github.com"` to + allow all backend-supported methods, an explicit `(target, method_or_methods)` + tuple such as `("github.com", "GET")`, or an `AllowedDomain` named tuple. 
diff --git a/python/packages/hyperlight/agent_framework_hyperlight/__init__.py b/python/packages/hyperlight/agent_framework_hyperlight/__init__.py index 2be9c2f7cb..511252d0df 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/__init__.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/__init__.py @@ -6,7 +6,7 @@ from ._execute_code_tool import HyperlightExecuteCodeTool from ._provider import HyperlightCodeActProvider -from ._types import FileMount, FileMountInput, FilesystemMode, NetworkMode +from ._types import AllowedDomain, AllowedDomainInput, FileMount, FileMountInput try: __version__ = importlib.metadata.version(__name__) @@ -14,11 +14,11 @@ __version__ = "0.0.0" __all__ = [ + "AllowedDomain", + "AllowedDomainInput", "FileMount", "FileMountInput", - "FilesystemMode", "HyperlightCodeActProvider", "HyperlightExecuteCodeTool", - "NetworkMode", "__version__", ] diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py index d39a56a8bb..875faa3413 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py @@ -19,7 +19,7 @@ from pydantic import BaseModel, Field from ._instructions import build_codeact_instructions, build_execute_code_description -from ._types import FileMount, FileMountHostPath, FileMountInput, FilesystemMode, NetworkMode +from ._types import AllowedDomain, AllowedDomainInput, FileMount, FileMountHostPath, FileMountInput DEFAULT_HYPERLIGHT_BACKEND = "wasm" DEFAULT_HYPERLIGHT_MODULE = "python_guest.path" @@ -50,18 +50,19 @@ class _RunConfig: module_path: str | None approval_mode: ApprovalMode tools: tuple[FunctionTool, ...] - filesystem_mode: FilesystemMode workspace_root: Path | None workspace_signature: tuple[tuple[str, int, int], ...] file_mounts: tuple[_NormalizedFileMount, ...] 
- network_mode: NetworkMode - allowed_domains: tuple[str, ...] - allowed_http_methods: tuple[str, ...] + allowed_domains: tuple[AllowedDomain, ...] @property def mounted_paths(self) -> tuple[str, ...]: return tuple(_display_mount_path(mount.mount_path) for mount in self.file_mounts) + @property + def filesystem_enabled(self) -> bool: + return self.workspace_root is not None or bool(self.file_mounts) + def cache_key(self) -> tuple[Any, ...]: return ( self.backend, @@ -69,13 +70,10 @@ def cache_key(self) -> tuple[Any, ...]: self.module_path, self.approval_mode, tuple((tool_obj.name, id(tool_obj)) for tool_obj in self.tools), - self.filesystem_mode, str(self.workspace_root) if self.workspace_root is not None else None, self.workspace_signature, tuple((mount.mount_path, str(mount.host_path), mount.path_signature) for mount in self.file_mounts), - self.network_mode, - self.allowed_domains, - self.allowed_http_methods, + tuple((allowed_domain.target, allowed_domain.methods) for allowed_domain in self.allowed_domains), ) @@ -183,12 +181,12 @@ def _normalize_file_mount_input(file_mount: FileMountInput) -> _StoredFileMount: def _normalize_domain(target: str) -> str: candidate = target.strip() if not candidate: - raise ValueError("Domain entries must not be empty.") + raise ValueError("Allowed domain entries must not be empty.") parsed = urlparse(candidate if "://" in candidate else f"//{candidate}") normalized = (parsed.netloc or parsed.path).strip().rstrip("/") if not normalized: - raise ValueError(f"Could not normalize domain entry: {target!r}.") + raise ValueError(f"Could not normalize allowed domain entry: {target!r}.") return normalized.lower() @@ -199,6 +197,53 @@ def _normalize_http_method(method: str) -> str: return normalized +def _normalize_http_methods(methods: str | Sequence[str] | None) -> tuple[str, ...] 
| None: + if methods is None: + return None + + normalized_methods = ( + {_normalize_http_method(methods)} + if isinstance(methods, str) + else {_normalize_http_method(method) for method in methods} + ) + if not normalized_methods: + raise ValueError("Allowed domain methods must not be empty when provided.") + return tuple(sorted(normalized_methods)) + + +def _is_allowed_domain_pair(value: Any) -> TypeGuard[tuple[str, str | Sequence[str]]]: + if not isinstance(value, tuple) or isinstance(value, AllowedDomain): + return False + + value_tuple = cast(tuple[object, ...], value) + if len(value_tuple) != 2: + return False + + target, methods = value_tuple + if not isinstance(target, str): + return False + if isinstance(methods, str): + return True + return isinstance(methods, Sequence) + + +def _normalize_allowed_domain_input(allowed_domain: AllowedDomainInput) -> AllowedDomain: + if isinstance(allowed_domain, str): + return AllowedDomain(target=_normalize_domain(allowed_domain), methods=None) + + if isinstance(allowed_domain, AllowedDomain): + return AllowedDomain( + target=_normalize_domain(allowed_domain.target), + methods=_normalize_http_methods(allowed_domain.methods), + ) + + target, methods = allowed_domain + return AllowedDomain( + target=_normalize_domain(target), + methods=_normalize_http_methods(methods), + ) + + def _normalize_mount_path(mount_path: str) -> str: raw_path = mount_path.strip().replace("\\", "/") if not raw_path: @@ -372,8 +417,8 @@ def execute(self, *, config: _RunConfig, code: str) -> list[Content]: return _build_execution_contents(result=result, sandbox=entry.sandbox, output_dir=entry.output_dir) def _create_entry(self, config: _RunConfig) -> _SandboxEntry: - input_dir_handle = TemporaryDirectory() if config.filesystem_mode != "none" else None - output_dir_handle = TemporaryDirectory() if config.filesystem_mode == "read_write" else None + input_dir_handle = TemporaryDirectory() if config.filesystem_enabled else None + output_dir_handle = 
TemporaryDirectory() if config.filesystem_enabled else None if input_dir_handle is not None: _populate_input_dir(config=config, input_root=Path(input_dir_handle.name)) @@ -396,10 +441,11 @@ def _create_entry(self, config: _RunConfig) -> _SandboxEntry: for tool_obj in config.tools: sandbox.register_tool(tool_obj.name, _make_sandbox_callback(tool_obj)) - if config.network_mode == "allow_list": - methods = list(config.allowed_http_methods) or None - for domain in config.allowed_domains: - sandbox.allow_domain(domain, methods=methods) + for allowed_domain in config.allowed_domains: + sandbox.allow_domain( + allowed_domain.target, + methods=list(allowed_domain.methods) if allowed_domain.methods is not None else None, + ) sandbox.run("None") snapshot = sandbox.snapshot() @@ -420,12 +466,9 @@ def __init__( *, tools: FunctionTool | Callable[..., Any] | Sequence[FunctionTool | Callable[..., Any]] | None = None, approval_mode: ApprovalMode | None = None, - filesystem_mode: FilesystemMode = "none", workspace_root: str | Path | None = None, file_mounts: FileMountInput | Sequence[FileMountInput] | None = None, - network_mode: NetworkMode = "none", - allowed_domains: str | Sequence[str] | None = None, - allowed_http_methods: str | Sequence[str] | None = None, + allowed_domains: AllowedDomainInput | Sequence[AllowedDomainInput] | None = None, backend: str = DEFAULT_HYPERLIGHT_BACKEND, module: str | None = DEFAULT_HYPERLIGHT_MODULE, module_path: str | None = None, @@ -441,25 +484,18 @@ def __init__( self._state_lock = threading.RLock() self._registry = _registry or _SandboxRegistry() self._default_approval_mode: ApprovalMode = approval_mode or "never_require" - self._filesystem_mode: FilesystemMode = filesystem_mode self._workspace_root = _resolve_workspace_root(workspace_root) - if self._filesystem_mode == "none" and self._workspace_root is not None: - raise ValueError("workspace_root requires filesystem_mode to be 'read_only' or 'read_write'.") - self._network_mode: NetworkMode 
= network_mode self._backend: str = backend self._module: str | None = module self._module_path: str | None = module_path self._managed_tools: list[FunctionTool] = [] self._file_mounts: dict[str, _StoredFileMount] = {} - self._allowed_domains: set[str] = set() - self._allowed_http_methods: set[str] = set() + self._allowed_domains: dict[str, AllowedDomain] = {} if tools is not None: self.add_tools(tools) if file_mounts is not None: self.add_file_mounts(file_mounts) - if allowed_http_methods is not None: - self.add_allowed_http_methods(allowed_http_methods) if allowed_domains is not None: self.add_allowed_domains(allowed_domains) @@ -472,14 +508,13 @@ def description(self) -> str: return str(self.__dict__.get("description", EXECUTE_CODE_INPUT_DESCRIPTION)) with state_lock: + allowed_domains = sorted(self._allowed_domains.values(), key=lambda value: value.target) return build_execute_code_description( tools=self._managed_tools, - filesystem_mode=self._filesystem_mode, + filesystem_enabled=self._workspace_root is not None or bool(self._file_mounts), workspace_enabled=self._workspace_root is not None, mounted_paths=[_display_mount_path(mount.mount_path) for mount in self._file_mounts.values()], - network_mode=self._network_mode, - allowed_domains=sorted(self._allowed_domains), - allowed_http_methods=sorted(self._allowed_http_methods), + allowed_domains=allowed_domains, ) @description.setter @@ -522,9 +557,6 @@ def add_file_mounts(self, file_mounts: FileMountInput | Sequence[FileMountInput] A single string uses the same relative path on the host and in the sandbox. Use a two-string tuple or `FileMount` when those paths differ. 
""" - if self._filesystem_mode == "none": - raise ValueError("File mounts require filesystem_mode to be 'read_only' or 'read_write'.") - if isinstance(file_mounts, str) or _is_file_mount_pair(file_mounts): normalized_mounts = [_normalize_file_mount_input(file_mounts)] else: @@ -557,97 +589,56 @@ def clear_file_mounts(self) -> None: with self._state_lock: self._file_mounts.clear() - def add_allowed_domains(self, domains: str | Sequence[str]) -> None: - """Add one or more outbound allow-list domains.""" - if self._network_mode == "none": - raise ValueError("Allowed domains require network_mode='allow_list'.") + def add_allowed_domains(self, domains: AllowedDomainInput | Sequence[AllowedDomainInput]) -> None: + """Add one or more outbound allow-list entries.""" + if isinstance(domains, (str, AllowedDomain)) or _is_allowed_domain_pair(domains): + normalized_domains = [_normalize_allowed_domain_input(domains)] + else: + normalized_domains = [ + _normalize_allowed_domain_input(domain) for domain in cast(Sequence[AllowedDomainInput], domains) + ] - normalized_domains = ( - {_normalize_domain(domains)} - if isinstance(domains, str) - else {_normalize_domain(domain) for domain in domains} - ) with self._state_lock: - self._allowed_domains.update(normalized_domains) + for normalized_domain in normalized_domains: + self._allowed_domains[normalized_domain.target] = normalized_domain - def get_allowed_domains(self) -> list[str]: - """Return the configured outbound allow-list domains.""" + def get_allowed_domains(self) -> list[AllowedDomain]: + """Return the configured outbound allow-list entries.""" with self._state_lock: - return sorted(self._allowed_domains) + return sorted(self._allowed_domains.values(), key=lambda value: value.target) def remove_allowed_domain(self, domain: str) -> None: - """Remove one outbound allow-list domain.""" + """Remove one outbound allow-list entry.""" normalized_domain = _normalize_domain(domain) with self._state_lock: if normalized_domain not in 
self._allowed_domains: raise KeyError(f"No allowed domain exists for {domain!r}.") - self._allowed_domains.remove(normalized_domain) + del self._allowed_domains[normalized_domain] def clear_allowed_domains(self) -> None: - """Remove all outbound allow-list domains.""" + """Remove all outbound allow-list entries.""" with self._state_lock: self._allowed_domains.clear() - def add_allowed_http_methods(self, methods: str | Sequence[str]) -> None: - """Add one or more outbound HTTP methods for the allow-list policy.""" - if self._network_mode == "none": - raise ValueError("Allowed HTTP methods require network_mode='allow_list'.") - - normalized_methods = ( - {_normalize_http_method(methods)} - if isinstance(methods, str) - else {_normalize_http_method(method) for method in methods} - ) - with self._state_lock: - self._allowed_http_methods.update(normalized_methods) - - def get_allowed_http_methods(self) -> list[str]: - """Return the configured outbound allow-list HTTP methods.""" - with self._state_lock: - return sorted(self._allowed_http_methods) - - def remove_allowed_http_method(self, method: str) -> None: - """Remove one outbound allow-list HTTP method.""" - normalized_method = _normalize_http_method(method) - with self._state_lock: - if normalized_method not in self._allowed_http_methods: - raise KeyError(f"No allowed HTTP method exists for {method!r}.") - self._allowed_http_methods.remove(normalized_method) - - def clear_allowed_http_methods(self) -> None: - """Remove all outbound allow-list HTTP methods.""" - with self._state_lock: - self._allowed_http_methods.clear() - def build_instructions(self, *, tools_visible_to_model: bool) -> str: """Build the current CodeAct instructions for this execute_code surface.""" config = self._build_run_config() return build_codeact_instructions( tools=config.tools, tools_visible_to_model=tools_visible_to_model, - filesystem_mode=config.filesystem_mode, - workspace_enabled=config.workspace_root is not None, - 
mounted_paths=config.mounted_paths, - network_mode=config.network_mode, - allowed_domains=config.allowed_domains, - allowed_http_methods=config.allowed_http_methods, ) def create_run_tool(self) -> HyperlightExecuteCodeTool: """Create a run-scoped snapshot of this execute_code surface.""" file_mounts = self.get_file_mounts() allowed_domains = self.get_allowed_domains() - allowed_http_methods = self.get_allowed_http_methods() return HyperlightExecuteCodeTool( tools=self.get_tools(), approval_mode=self._default_approval_mode, - filesystem_mode=self._filesystem_mode, workspace_root=self._workspace_root, file_mounts=file_mounts or None, - network_mode=self._network_mode, allowed_domains=allowed_domains or None, - allowed_http_methods=allowed_http_methods or None, backend=self._backend, module=self._module, module_path=self._module_path, @@ -663,7 +654,7 @@ def build_serializable_state(self) -> dict[str, Any]: "module_path": config.module_path, "approval_mode": config.approval_mode, "tool_names": [tool_obj.name for tool_obj in config.tools], - "filesystem_mode": config.filesystem_mode, + "filesystem_enabled": config.filesystem_enabled, "workspace_root": str(config.workspace_root) if config.workspace_root is not None else None, "file_mounts": [ { @@ -672,9 +663,14 @@ def build_serializable_state(self) -> dict[str, Any]: } for mount in config.file_mounts ], - "network_mode": config.network_mode, - "allowed_domains": list(config.allowed_domains), - "allowed_http_methods": list(config.allowed_http_methods), + "network_enabled": bool(config.allowed_domains), + "allowed_domains": [ + { + "target": allowed_domain.target, + "methods": list(allowed_domain.methods) if allowed_domain.methods is not None else None, + } + for allowed_domain in config.allowed_domains + ], } def to_dict(self, *, exclude: set[str] | None = None, exclude_none: bool = True) -> dict[str, Any]: @@ -692,8 +688,7 @@ def _build_run_config(self) -> _RunConfig: managed_tools = tuple(self._managed_tools) 
workspace_root = self._workspace_root stored_mounts = tuple(self._file_mounts.values()) - allowed_domains = tuple(sorted(self._allowed_domains)) - allowed_http_methods = tuple(sorted(self._allowed_http_methods)) + allowed_domains = tuple(sorted(self._allowed_domains.values(), key=lambda value: value.target)) approval_mode = _resolve_execute_code_approval_mode( base_approval_mode=self._default_approval_mode, tools=managed_tools, @@ -715,13 +710,10 @@ def _build_run_config(self) -> _RunConfig: module_path=self._module_path, approval_mode=approval_mode, tools=managed_tools, - filesystem_mode=self._filesystem_mode, workspace_root=workspace_root, workspace_signature=workspace_signature, file_mounts=normalized_mounts, - network_mode=self._network_mode, allowed_domains=allowed_domains, - allowed_http_methods=allowed_http_methods, ) def _run_code(self, *, code: str) -> list[Content]: diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_instructions.py b/python/packages/hyperlight/agent_framework_hyperlight/_instructions.py index 77e7993e95..f866c1349c 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_instructions.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_instructions.py @@ -6,7 +6,7 @@ from agent_framework import FunctionTool -from ._types import FilesystemMode, NetworkMode +from ._types import AllowedDomain def _format_tool_summaries(tools: Sequence[FunctionTool]) -> str: @@ -25,19 +25,16 @@ def _format_tool_summaries(tools: Sequence[FunctionTool]) -> str: def _format_filesystem_capabilities( *, - filesystem_mode: FilesystemMode, + filesystem_enabled: bool, workspace_enabled: bool, mounted_paths: Sequence[str], ) -> str: - if filesystem_mode == "none": - return "Filesystem access is disabled." + if not filesystem_enabled: + return "Filesystem access is unavailable because no workspace root or file mounts are configured." 
lines = ["Filesystem access is enabled."] lines.append("Read files from `/input`.") - if filesystem_mode == "read_write": - lines.append("Write generated artifacts to `/output`; returned files will be attached to the tool result.") - else: - lines.append("The sandbox does not expose a writable `/output` directory in this configuration.") + lines.append("Write generated artifacts to `/output`; returned files will be attached to the tool result.") if workspace_enabled: lines.append("The configured workspace root is available under `/input/`.") @@ -53,23 +50,17 @@ def _format_filesystem_capabilities( def _format_network_capabilities( *, - network_mode: NetworkMode, - allowed_domains: Sequence[str], - allowed_http_methods: Sequence[str], + allowed_domains: Sequence[AllowedDomain], ) -> str: - if network_mode == "none": - return "Outbound network access is disabled." - - methods_text = ", ".join(allowed_http_methods) if allowed_http_methods else "all methods allowed by the backend" if not allowed_domains: - return "Outbound network access uses an allow-list, but no domains are currently configured." - - lines = [ - "Outbound network access uses an allow-list.", - f"Allowed HTTP methods: {methods_text}.", - "Allowed domains:", - ] - lines.extend(f"- `{domain}`" for domain in allowed_domains) + return "Outbound network access is unavailable because no allow-listed targets are configured." 
+ + lines = ["Outbound network access is allowed only for these configured targets:"] + for allowed_domain in allowed_domains: + methods_text = ( + ", ".join(allowed_domain.methods) if allowed_domain.methods else "all methods allowed by the backend" + ) + lines.append(f"- `{allowed_domain.target}`: {methods_text}.") return "\n".join(lines) @@ -77,12 +68,6 @@ def build_codeact_instructions( *, tools: Sequence[FunctionTool], tools_visible_to_model: bool, - filesystem_mode: FilesystemMode, - workspace_enabled: bool, - mounted_paths: Sequence[str], - network_mode: NetworkMode, - allowed_domains: Sequence[str], - allowed_http_methods: Sequence[str], ) -> str: """Build dynamic CodeAct instructions for the effective sandbox state.""" usage_note = ( @@ -105,23 +90,19 @@ def build_codeact_instructions( def build_execute_code_description( *, tools: Sequence[FunctionTool], - filesystem_mode: FilesystemMode, + filesystem_enabled: bool, workspace_enabled: bool, mounted_paths: Sequence[str], - network_mode: NetworkMode, - allowed_domains: Sequence[str], - allowed_http_methods: Sequence[str], + allowed_domains: Sequence[AllowedDomain], ) -> str: """Build the dynamic execute_code tool description for standalone usage.""" filesystem_text = _format_filesystem_capabilities( - filesystem_mode=filesystem_mode, + filesystem_enabled=filesystem_enabled, workspace_enabled=workspace_enabled, mounted_paths=mounted_paths, ) network_text = _format_network_capabilities( - network_mode=network_mode, allowed_domains=allowed_domains, - allowed_http_methods=allowed_http_methods, ) return f"""Execute Python in an isolated Hyperlight sandbox. 
diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_provider.py b/python/packages/hyperlight/agent_framework_hyperlight/_provider.py index 55e0974d93..1232ecc262 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_provider.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_provider.py @@ -10,7 +10,7 @@ from agent_framework._tools import ApprovalMode from ._execute_code_tool import HyperlightExecuteCodeTool, SandboxRuntime -from ._types import FileMount, FileMountInput, FilesystemMode, NetworkMode +from ._types import AllowedDomain, AllowedDomainInput, FileMount, FileMountInput class HyperlightCodeActProvider(ContextProvider): @@ -24,12 +24,9 @@ def __init__( *, tools: FunctionTool | Callable[..., Any] | Sequence[FunctionTool | Callable[..., Any]] | None = None, approval_mode: ApprovalMode | None = None, - filesystem_mode: FilesystemMode = "none", workspace_root: str | Path | None = None, file_mounts: FileMountInput | Sequence[FileMountInput] | None = None, - network_mode: NetworkMode = "none", - allowed_domains: str | Sequence[str] | None = None, - allowed_http_methods: str | Sequence[str] | None = None, + allowed_domains: AllowedDomainInput | Sequence[AllowedDomainInput] | None = None, backend: str = "wasm", module: str | None = "python_guest.path", module_path: str | None = None, @@ -39,12 +36,9 @@ def __init__( self._execute_code_tool = HyperlightExecuteCodeTool( tools=tools, approval_mode=approval_mode, - filesystem_mode=filesystem_mode, workspace_root=workspace_root, file_mounts=file_mounts, - network_mode=network_mode, allowed_domains=allowed_domains, - allowed_http_methods=allowed_http_methods, backend=backend, module=module, module_path=module_path, @@ -86,38 +80,22 @@ def clear_file_mounts(self) -> None: """Remove all provider-managed file mounts.""" self._execute_code_tool.clear_file_mounts() - def add_allowed_domains(self, domains: str | Sequence[str]) -> None: - """Add provider-managed outbound allow-list 
domains.""" + def add_allowed_domains(self, domains: AllowedDomainInput | Sequence[AllowedDomainInput]) -> None: + """Add provider-managed outbound allow-list entries.""" self._execute_code_tool.add_allowed_domains(domains) - def get_allowed_domains(self) -> list[str]: - """Return the provider-managed outbound allow-list domains.""" + def get_allowed_domains(self) -> list[AllowedDomain]: + """Return the provider-managed outbound allow-list entries.""" return self._execute_code_tool.get_allowed_domains() def remove_allowed_domain(self, domain: str) -> None: - """Remove one provider-managed outbound allow-list domain.""" + """Remove one provider-managed outbound allow-list entry.""" self._execute_code_tool.remove_allowed_domain(domain) def clear_allowed_domains(self) -> None: - """Remove all provider-managed outbound allow-list domains.""" + """Remove all provider-managed outbound allow-list entries.""" self._execute_code_tool.clear_allowed_domains() - def add_allowed_http_methods(self, methods: str | Sequence[str]) -> None: - """Add provider-managed outbound HTTP methods.""" - self._execute_code_tool.add_allowed_http_methods(methods) - - def get_allowed_http_methods(self) -> list[str]: - """Return the provider-managed outbound HTTP methods.""" - return self._execute_code_tool.get_allowed_http_methods() - - def remove_allowed_http_method(self, method: str) -> None: - """Remove one provider-managed outbound HTTP method.""" - self._execute_code_tool.remove_allowed_http_method(method) - - def clear_allowed_http_methods(self) -> None: - """Remove all provider-managed outbound HTTP methods.""" - self._execute_code_tool.clear_allowed_http_methods() - async def before_run( self, *, diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_types.py b/python/packages/hyperlight/agent_framework_hyperlight/_types.py index 84081f9f8f..8d202c8986 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_types.py +++ 
b/python/packages/hyperlight/agent_framework_hyperlight/_types.py @@ -2,11 +2,9 @@ from __future__ import annotations +from collections.abc import Sequence from pathlib import Path -from typing import Literal, NamedTuple, TypeAlias - -FilesystemMode = Literal["none", "read_only", "read_write"] -NetworkMode = Literal["none", "allow_list"] +from typing import NamedTuple, TypeAlias class FileMount(NamedTuple): @@ -18,3 +16,13 @@ class FileMount(NamedTuple): FileMountHostPath: TypeAlias = str | Path FileMountInput: TypeAlias = str | tuple[FileMountHostPath, str] | FileMount + + +class AllowedDomain(NamedTuple): + """Allow outbound requests to one target, optionally restricted to specific HTTP methods.""" + + target: str + methods: tuple[str, ...] | None = None + + +AllowedDomainInput: TypeAlias = str | tuple[str, str | Sequence[str]] | AllowedDomain diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py index ac69dea65b..e8db02eb99 100644 --- a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -29,7 +29,7 @@ tool, ) -from agent_framework_hyperlight import FileMount, HyperlightCodeActProvider, HyperlightExecuteCodeTool +from agent_framework_hyperlight import AllowedDomain, FileMount, HyperlightCodeActProvider, HyperlightExecuteCodeTool from agent_framework_hyperlight import _execute_code_tool as execute_code_module @@ -324,7 +324,7 @@ def test_execute_code_tool_replaces_tools_with_the_same_name() -> None: assert execute_code.approval_mode == "always_require" -def test_execute_code_tool_accepts_string_and_tuple_file_mounts( +def test_execute_code_tool_accepts_string_and_tuple_file_mounts_without_mode_flags( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -334,7 +334,7 @@ def test_execute_code_tool_accepts_string_and_tuple_file_mounts( 
explicit_file.write_text('{"hello": "world"}', encoding="utf-8") monkeypatch.chdir(tmp_path) - execute_code = HyperlightExecuteCodeTool(filesystem_mode="read_only", _registry=_FakeRuntime()) + execute_code = HyperlightExecuteCodeTool(_registry=_FakeRuntime()) execute_code.add_file_mounts("notes.txt") execute_code.add_file_mounts((explicit_file, "data/data.json")) @@ -344,15 +344,19 @@ def test_execute_code_tool_accepts_string_and_tuple_file_mounts( ] -def test_execute_code_tool_requires_enabled_capabilities(tmp_path: Path) -> None: +def test_execute_code_tool_allowed_domains_use_structured_entries_and_replace_by_target() -> None: execute_code = HyperlightExecuteCodeTool(_registry=_FakeRuntime()) - mount = (str(tmp_path), "data") - with pytest.raises(ValueError, match="filesystem_mode"): - execute_code.add_file_mounts(mount) + execute_code.add_allowed_domains(["https://api.example.com/v1", ("github.com", "get")]) + execute_code.add_allowed_domains([ + AllowedDomain("api.example.com", ("post", "get")), + ("github.com", ["head", "get"]), + ]) - with pytest.raises(ValueError, match="network_mode"): - execute_code.add_allowed_domains("api.example.com") + assert execute_code.get_allowed_domains() == [ + AllowedDomain("api.example.com", ("GET", "POST")), + AllowedDomain("github.com", ("GET", "HEAD")), + ] def test_execute_code_tool_description_contains_call_tool_guidance(tmp_path: Path) -> None: @@ -364,12 +368,9 @@ def test_execute_code_tool_description_contains_call_tool_guidance(tmp_path: Pat execute_code = HyperlightExecuteCodeTool( tools=[compute], - filesystem_mode="read_write", workspace_root=workspace_root, file_mounts=[FileMount(str(mount_file), "data/data.json")], - network_mode="allow_list", - allowed_domains=["https://api.example.com/v1"], - allowed_http_methods=["get"], + allowed_domains=[AllowedDomain("https://api.example.com/v1", ("get", "post")), "github.com"], _registry=_FakeRuntime(), ) @@ -380,7 +381,8 @@ def 
test_execute_code_tool_description_contains_call_tool_guidance(tmp_path: Pat assert "/input/data/data.json" in description assert "/output" in description assert "api.example.com" in description - assert "GET" in description + assert "GET, POST" in description + assert "github.com" in description async def test_execute_code_tool_executes_with_structured_content(monkeypatch: pytest.MonkeyPatch) -> None: @@ -389,10 +391,8 @@ async def test_execute_code_tool_executes_with_structured_content(monkeypatch: p execute_code = HyperlightExecuteCodeTool( tools=[compute], - filesystem_mode="read_write", - network_mode="allow_list", - allowed_domains=["api.example.com"], - allowed_http_methods=["get"], + file_mounts=[FileMount(Path(__file__), "fixtures/source.py")], + allowed_domains=[("api.example.com", "get")], ) result = await execute_code.invoke(arguments={"code": "create-output"}) @@ -462,6 +462,7 @@ async def test_agent_runs_hyperlight_codeact_end_to_end_with_fake_sandbox(monkey assert "compute" in _FakeSandbox.instances[0].registered_tools +@pytest.mark.integration @skip_if_hyperlight_integration_tests_disabled async def test_agent_runs_hyperlight_codeact_end_to_end_with_real_sandbox() -> None: client = _FakeCodeActChatClient() @@ -474,6 +475,7 @@ async def test_agent_runs_hyperlight_codeact_end_to_end_with_real_sandbox() -> N assert client.call_count == 2 +@pytest.mark.integration @skip_if_hyperlight_integration_tests_disabled async def test_provider_run_tool_reads_writes_files_and_accesses_allowed_url_with_real_sandbox( tmp_path: Path, @@ -482,13 +484,9 @@ async def test_provider_run_tool_reads_writes_files_and_accesses_allowed_url_wit mounted_file.write_text("hello from mount", encoding="utf-8") with _serve_http_text_response(b"network ok") as (allowed_host, requests): - provider = HyperlightCodeActProvider( - filesystem_mode="read_write", - network_mode="allow_list", - ) + provider = HyperlightCodeActProvider() provider.add_file_mounts((mounted_file, 
"data/input.txt")) - provider.add_allowed_domains(allowed_host) - provider.add_allowed_http_methods("GET") + provider.add_allowed_domains((allowed_host, "GET")) context = _FakeSessionContext() state: dict[str, Any] = {}