diff --git a/embodichain/gen_sim/prompt2scene/.gitignore b/embodichain/gen_sim/prompt2scene/.gitignore
new file mode 100644
index 000000000..75f4908e8
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/.gitignore
@@ -0,0 +1,7 @@
+cli/preview*
+cli/export*
+agent_tools/servers/geometry_generation_server/*
+
+# Python cache
+__pycache__/
+*.py[cod]
diff --git a/embodichain/gen_sim/prompt2scene/__init__.py b/embodichain/gen_sim/prompt2scene/__init__.py
new file mode 100644
index 000000000..01ece10d4
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/__init__.py
@@ -0,0 +1,15 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
\ No newline at end of file
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/__init__.py
new file mode 100644
index 000000000..a4b11ff06
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/__init__.py
@@ -0,0 +1 @@
+"""Internal client + External server for agent tool calling."""
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/__init__.py
new file mode 100644
index 000000000..3afc32bd0
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/__init__.py
@@ -0,0 +1,31 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.base import BaseHttpClient
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import ClientError
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+ DEFAULT_CLIENT_CONFIG_PATH,
+ load_client_config,
+)
+
+__all__ = [
+ "BaseHttpClient",
+ "ClientError",
+ "DEFAULT_CLIENT_CONFIG_PATH",
+ "load_client_config",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/base.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/base.py
new file mode 100644
index 000000000..8981602f6
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/base.py
@@ -0,0 +1,131 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import time
+from pathlib import Path
+from typing import Callable
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+ ClientError,
+ build_client_error,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+ load_client_config,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import (
+ log_api_request_start,
+ log_info,
+ log_warning,
+)
+
+__all__ = ["BaseHttpClient"]
+
+
+class BaseHttpClient:
+    """Shared HTTP client behavior for agent-tool service clients."""
+
+    def __init__(
+        self,
+        *,
+        config_key: str,
+        server_name: str,
+        base_url: str | None = None,
+        timeout_s: int | None = None,
+        config_path: Path | None = None,
+        session: requests.Session | None = None,
+        trust_env: bool = True,
+    ) -> None:
+        """Initialize common service client fields from config.
+
+        Truthy ``base_url``/``timeout_s`` arguments override the config section.
+        """
+        self.config = load_client_config(config_key, config_path)
+        self.server_name = server_name
+        self.base_url = (base_url or str(self.config["base_url"])).rstrip("/")
+        self.timeout_s = int(timeout_s or self.config.get("timeout_s", 120))
+        self.health_path = str(self.config.get("health_path", "/health"))
+        self.session = session or requests.Session()
+        self.session.trust_env = trust_env  # also applied to a caller's session
+        log_info(f"{self.server_name} client initialized for {self.base_url}")
+
+    def health_check(self) -> bool:
+        """Return True when a GET on ``base_url + health_path`` succeeds."""
+        try:
+            url = f"{self.base_url}{self.health_path}"
+            self.session.get(url, timeout=5).raise_for_status()
+            return True
+        except Exception as exc:  # best-effort probe: log and report, never raise
+            log_warning(f"{self.server_name} health check failed: {exc}")
+            return False
+
+    def _backoff(self, reason: str, attempt: int, total: int) -> bool:
+        """Sleep ``min(2**attempt, 60)`` seconds if another attempt remains."""
+        if attempt >= total - 1:
+            return False
+        log_warning(f"{self.server_name} {reason}; retrying ({attempt + 1}/{total}).")
+        time.sleep(min(2**attempt, 60))
+        return True
+
+    def post_with_retries(
+        self,
+        request_fn: Callable[[], requests.Response],
+        *,
+        max_retries: int,
+        error_cls: type[ClientError] = ClientError,
+        request_label: str | None = None,
+    ) -> requests.Response | ClientError:
+        """Run a POST request function with retry and HTTP error handling.
+
+        ``max_retries`` is the total attempt count; values below 1 still send
+        one request. Connection errors and HTTP 5xx are retried with backoff.
+        Returns the response, or an ``error_cls`` built from a failed response.
+        Raises ConnectionError, TimeoutError or RuntimeError otherwise.
+        """
+        total = max(1, max_retries)
+        for attempt in range(total):
+            try:
+                if request_label is not None:
+                    log_api_request_start(
+                        step=self.server_name,
+                        request=request_label,
+                        attempt=attempt + 1,
+                    )
+                response = request_fn()
+                response.raise_for_status()
+                return response
+            except requests.exceptions.ConnectionError as exc:
+                if self._backoff("connection failed", attempt, total):
+                    continue
+                raise ConnectionError(
+                    f"Failed to connect to {self.server_name} at {self.base_url}"
+                ) from exc
+            except requests.exceptions.HTTPError as exc:
+                response = exc.response
+                if response is None:
+                    raise RuntimeError(f"{self.server_name} HTTP request failed.") from exc
+                retryable = response.status_code >= 500
+                if retryable and self._backoff("server error", attempt, total):
+                    continue
+                return build_client_error(
+                    response, server_name=self.server_name, error_cls=error_cls
+                )
+            except requests.exceptions.Timeout as exc:
+                raise TimeoutError(f"{self.server_name} request timed out.") from exc
+        raise RuntimeError(f"{self.server_name} request failed unexpectedly.")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/common.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/common.py
new file mode 100644
index 000000000..f1c7dc690
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/common.py
@@ -0,0 +1,139 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import requests
+
+__all__ = [
+ "ClientError",
+ "build_client_error",
+ "first_string",
+ "format_http_error",
+ "parse_error_response",
+ "parse_json_object_response",
+ "validate_required_strings",
+ "validate_png_response",
+]
+
+
+@dataclass(frozen=True)
+class ClientError:
+    """Immutable record describing a failed HTTP call to an agent-tool server."""
+
+    error_message: str  # human-readable failure description
+    status_code: int | None = None  # HTTP status of the failed response, if any
+    content_type: str | None = None  # Content-Type header of the failed response
+    headers: dict[str, str] = field(default_factory=dict)  # response headers copy
+    raw_response: dict[str, Any] | None = None  # JSON-object body when parseable
+
+
+def validate_png_response(response: requests.Response, png_bytes: bytes) -> None:
+    """Raise RuntimeError unless the response is declared and encoded as PNG."""
+    declared_type = response.headers.get("Content-Type", "")
+    # The header only needs to mention image/png, in any letter case.
+    if "image/png" not in declared_type.lower():
+        shown_type = declared_type or "unknown"
+        raise RuntimeError(
+            f"Image generation server returned non-PNG content: {shown_type}"
+        )
+    # The payload must begin with this fixed 8-byte signature.
+    if not png_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
+        raise RuntimeError("Image generation server returned invalid PNG bytes.")
+
+
+def validate_required_strings(fields: dict[str, object]) -> None:
+    """Raise ValueError when a required field is None or blank after ``str()``."""
+    for field_name, value in fields.items():
+        if value is None or not str(value).strip():  # str(None) is "None"
+            raise ValueError(f"{field_name} must be non-empty.")
+
+
+def format_http_error(response: requests.Response, *, server_name: str) -> str:
+    """Format an HTTP error response from an agent-tool server.
+
+    Uses the first string found under ``error``, ``error_message``, ``message``
+    or ``detail`` in a JSON-object body; any other body (non-JSON, or JSON that
+    is not an object) falls back to a message carrying only the status code.
+    """
+    fallback = f"{server_name} HTTP error: {response.status_code}"
+    try:
+        response_data = response.json()
+    except ValueError:
+        return fallback
+    if not isinstance(response_data, dict):
+        return fallback  # e.g. a JSON list/string body has no keys to inspect
+    keys = ("error", "error_message", "message", "detail")
+    error_message = first_string(response_data, *keys)
+    return f"{server_name} error: {error_message}" if error_message else fallback
+
+
+def parse_error_response(response: requests.Response) -> dict[str, Any] | None:
+    """Return the error body when it decodes to a JSON object, otherwise None."""
+    try:
+        payload = response.json()
+    except ValueError:
+        payload = None  # body is not JSON at all
+    return payload if isinstance(payload, dict) else None
+
+
+def build_client_error(
+    response: requests.Response,
+    *,
+    server_name: str,
+    error_cls: type[ClientError] = ClientError,
+) -> ClientError:
+    """Translate a failed HTTP response into an ``error_cls`` instance."""
+    # Derive the readable message and the parsed body before assembling the record.
+    message = format_http_error(response, server_name=server_name)
+    json_body = parse_error_response(response)
+    return error_cls(
+        error_message=message,
+        status_code=response.status_code,
+        content_type=response.headers.get("Content-Type"),
+        headers=dict(response.headers),
+        raw_response=json_body,
+    )
+
+
+def parse_json_object_response(
+    response: requests.Response,
+    *,
+    server_name: str,
+) -> dict[str, Any]:
+    """Return the body decoded as a JSON object, raising RuntimeError otherwise."""
+    try:
+        decoded = response.json()
+    except ValueError as exc:
+        # Report the advertised media type to help diagnose non-JSON replies.
+        media_type = response.headers.get("Content-Type") or "unknown"
+        message = f"{server_name} returned invalid JSON content: {media_type}"
+        raise RuntimeError(message) from exc
+    if isinstance(decoded, dict):
+        return decoded
+    raise RuntimeError(f"{server_name} response must be a JSON object.")
+
+
+def first_string(data: dict[str, Any], *keys: str) -> str | None:
+    """Look up ``keys`` in order and return the first value that is a ``str``.
+
+    Missing keys and non-string values are skipped; None means nothing matched.
+    """
+    candidates = (data.get(key) for key in keys)
+    return next((item for item in candidates if isinstance(item, str)), None)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/config.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/config.py
new file mode 100644
index 000000000..5592806a8
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/config.py
@@ -0,0 +1,50 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+__all__ = ["DEFAULT_CLIENT_CONFIG_PATH", "load_client_config"]
+
+DEFAULT_CLIENT_CONFIG_PATH = Path(__file__).resolve().parents[2].joinpath(
+    "configs", "client_config.json"
+)  # parents[2] is two directories above the one containing this module
+
+
+def load_client_config(
+    config_key: str,
+    config_path: Path | None = None,
+) -> dict[str, Any]:
+    """Load one agent-tool client config section.
+
+    Raises FileNotFoundError for a missing file, ValueError for malformed content.
+    """
+    resolved_config_path = (config_path or DEFAULT_CLIENT_CONFIG_PATH).resolve()
+    if not resolved_config_path.is_file():
+        raise FileNotFoundError(f"Client config not found: {resolved_config_path}")
+    with resolved_config_path.open("r", encoding="utf-8") as f:
+        raw_config = json.load(f)
+    # A non-object root (list, string, ...) has no sections; avoid AttributeError.
+    config = raw_config.get(config_key) if isinstance(raw_config, dict) else None
+    if not isinstance(config, dict):
+        section = f"Client config section {config_key!r}"
+        raise ValueError(f"{section} not found in {resolved_config_path}")
+    if not config.get("base_url"):
+        raise ValueError(f"Client config section {config_key!r} requires base_url.")
+    return config
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/__init__.py
new file mode 100644
index 000000000..3fa63f3b1
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/__init__.py
@@ -0,0 +1,49 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client.client import (
+ GeometryGenerationClient,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client.schemas import (
+ GeometryGenerationError,
+ GeometryGenerationResult,
+ GeometryGenerationServerRequest,
+ GeometryGenerationServerResponse,
+ MultiObjectGenerationError,
+ MultiObjectGenerationObject,
+ MultiObjectGenerationResult,
+ MultiObjectGenerationServerRequest,
+ MultiObjectGenerationServerResponse,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+ DEFAULT_CLIENT_CONFIG_PATH,
+)
+
+__all__ = [
+ "DEFAULT_CLIENT_CONFIG_PATH",
+ "GeometryGenerationClient",
+ "GeometryGenerationError",
+ "GeometryGenerationResult",
+ "GeometryGenerationServerRequest",
+ "GeometryGenerationServerResponse",
+ "MultiObjectGenerationError",
+ "MultiObjectGenerationObject",
+ "MultiObjectGenerationResult",
+ "MultiObjectGenerationServerRequest",
+ "MultiObjectGenerationServerResponse",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/client.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/client.py
new file mode 100644
index 000000000..0615c6d27
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/client.py
@@ -0,0 +1,213 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Client for the SAM3D geometry generation server."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.base import BaseHttpClient
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+ validate_required_strings,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+ DEFAULT_CLIENT_CONFIG_PATH,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client.parser import (
+ parse_geometry_generation_response,
+ parse_multi_object_generation_response,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client.schemas import (
+ GeometryGenerationError,
+ GeometryGenerationResult,
+ GeometryGenerationServerRequest,
+ GeometryGenerationServerResponse,
+ MultiObjectGenerationError,
+ MultiObjectGenerationObject,
+ MultiObjectGenerationServerRequest,
+ MultiObjectGenerationServerResponse,
+)
+
+__all__ = [
+ "DEFAULT_CLIENT_CONFIG_PATH",
+ "GeometryGenerationClient",
+ "GeometryGenerationError",
+ "GeometryGenerationResult",
+ "GeometryGenerationServerRequest",
+ "GeometryGenerationServerResponse",
+ "MultiObjectGenerationError",
+ "MultiObjectGenerationObject",
+ "MultiObjectGenerationServerRequest",
+ "MultiObjectGenerationServerResponse",
+]
+
+
+class GeometryGenerationClient(BaseHttpClient):
+    """HTTP client for the geometry server's single- and multi-object endpoints."""
+
+    def __init__(
+        self,
+        *,
+        base_url: str | None = None,
+        timeout_s: int | None = None,
+        config_path: Path | None = None,
+        config_key: str = "sam3d_generation",
+        session: requests.Session | None = None,
+    ) -> None:
+        """Set up the base client, then read both endpoint paths from config."""
+        super().__init__(
+            config_key=config_key,
+            server_name="Geometry generation server",
+            base_url=base_url,
+            timeout_s=timeout_s,
+            config_path=config_path,
+            session=session,
+            trust_env=False,
+        )
+        cfg = self.config
+        single_path = cfg.get("generate_single_object_path", "/generate_single_object")
+        multi_path = cfg.get(
+            "generate_multiple_objects_path", "/generate_multiple_objects"
+        )
+        self.generate_single_object_path = str(single_path)
+        self.generate_multiple_objects_path = str(multi_path)
+
+    def generate(
+        self,
+        request: GeometryGenerationServerRequest,
+        *,
+        max_retries: int = 3,
+    ) -> GeometryGenerationServerResponse | GeometryGenerationError:
+        """Upload one object image; HTTP failures come back as an error object."""
+        _validate_request(request)
+        endpoint = f"{self.base_url}{self.generate_single_object_path}"
+        outcome = self.post_with_retries(
+            lambda: _post_geometry_generation_request(self, endpoint, request),
+            max_retries=max_retries,
+            error_cls=GeometryGenerationError,
+            request_label="geometry_generation",
+        )
+        # A successful response still has to be parsed into the typed result.
+        if not isinstance(outcome, GeometryGenerationError):
+            return parse_geometry_generation_response(outcome, request)
+        return outcome
+
+    def generate_multiple_objects(
+        self,
+        request: MultiObjectGenerationServerRequest,
+        *,
+        output_dir: Path | None = None,
+        max_retries: int = 3,
+    ) -> MultiObjectGenerationServerResponse | MultiObjectGenerationError:
+        """Upload one image plus its masks; ``output_dir`` is passed to the parser."""
+        _validate_multi_object_request(request)
+        endpoint = f"{self.base_url}{self.generate_multiple_objects_path}"
+        outcome = self.post_with_retries(
+            lambda: _post_multi_object_generation_request(self, endpoint, request),
+            max_retries=max_retries,
+            error_cls=MultiObjectGenerationError,
+            request_label="multi_object_geometry_generation",
+        )
+        if isinstance(outcome, MultiObjectGenerationError):
+            return outcome
+        return parse_multi_object_generation_response(
+            outcome,
+            self.base_url,
+            output_dir=output_dir,
+            session=self.session,
+        )
+
+
+def _validate_request(request: GeometryGenerationServerRequest) -> None:
+    """Validate the image and output paths of a single-object request."""
+    required = {
+        "Geometry generation image_path": request.image_path,
+        "Geometry generation output_path": request.output_path,
+    }
+    validate_required_strings(required)
+    source_image = Path(request.image_path).expanduser()
+    if not source_image.is_file():
+        raise FileNotFoundError(f"Geometry generation input not found: {source_image}")
+    if not str(request.output_path).lower().endswith(".glb"):
+        raise ValueError("Geometry generation output_path must be a GLB file path.")
+
+
+def _post_geometry_generation_request(
+    client: GeometryGenerationClient,
+    url: str,
+    request: GeometryGenerationServerRequest,
+) -> requests.Response:
+    """POST the request image as the multipart ``image`` file field."""
+    with _open_image_file(request.image_path) as handle:
+        # The handle stays open until the POST call returns.
+        filename = Path(request.image_path).name
+        upload = {"image": (filename, handle)}
+        timeouts = (10, client.timeout_s)
+        return client.session.post(
+            url,
+            data=request.to_form_data(),
+            files=upload,
+            timeout=timeouts,
+        )
+
+
+def _open_image_file(image_path: str | Path) -> Any:
+    return open(Path(image_path).expanduser().resolve(), "rb")  # caller must close
+
+
+def _validate_multi_object_request(
+    request: MultiObjectGenerationServerRequest,
+) -> None:
+    """Validate the image path and every mask path of a multi-object request."""
+    required = {"Multi-object geometry generation image_path": request.image_path}
+    validate_required_strings(required)
+    source_image = Path(request.image_path).expanduser()
+    if not source_image.is_file():
+        raise FileNotFoundError(
+            f"Multi-object geometry generation input not found: {source_image}"
+        )
+    if not request.mask_paths:
+        raise ValueError("mask_paths must be non-empty.")
+    # Report the first listed mask that does not exist on disk.
+    missing = (m for m in request.mask_paths if not Path(m).expanduser().is_file())
+    absent = next(missing, None)
+    if absent is not None:
+        raise FileNotFoundError(f"Multi-object geometry mask not found: {absent}")
+
+
+def _post_multi_object_generation_request(
+    client: GeometryGenerationClient,
+    url: str,
+    request: MultiObjectGenerationServerRequest,
+) -> requests.Response:
+    """POST the image and masks as multipart files, closing every handle after."""
+    from contextlib import ExitStack  # local import keeps this block self-contained
+    with ExitStack() as stack:  # closes image + masks, even on partial open failure
+        image_file = stack.enter_context(_open_image_file(request.image_path))
+        files = [("image", (Path(request.image_path).name, image_file))]
+        for mask_path in request.mask_paths:
+            mask_file = stack.enter_context(_open_image_file(mask_path))
+            files.append(("masks", (Path(mask_path).name, mask_file)))
+        return client.session.post(
+            url,
+            data=request.to_form_data(),
+            files=files,
+            timeout=(10, client.timeout_s),
+        )
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/parser.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/parser.py
new file mode 100644
index 000000000..4d3c09671
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/parser.py
@@ -0,0 +1,255 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client.schemas import (
+ GeometryGenerationResult,
+ GeometryGenerationServerRequest,
+ GeometryGenerationServerResponse,
+ MultiObjectGenerationObject,
+ MultiObjectGenerationResult,
+ MultiObjectGenerationServerResponse,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import log_info
+
+__all__ = ["parse_geometry_generation_response", "parse_multi_object_generation_response"]
+
+
+def parse_geometry_generation_response(
+    response: requests.Response,
+    request: GeometryGenerationServerRequest,
+) -> GeometryGenerationServerResponse:
+    """Validate the GLB payload, write it to disk, and wrap the saved path."""
+    payload = response.content
+    _validate_glb_response(response, payload)
+    # Only a payload that passed validation is written to the output path.
+    saved_path = _write_glb_output(request, payload)
+    return GeometryGenerationServerResponse(
+        ok=True,
+        status="ok",
+        result=GeometryGenerationResult(geometry_path=str(saved_path)),
+        status_code=response.status_code,
+        content_type=response.headers.get("Content-Type"),
+        headers=dict(response.headers),
+    )
+
+
+def _validate_glb_response(response: requests.Response, glb_bytes: bytes) -> None:
+    """Raise RuntimeError unless ``glb_bytes`` begins with the ``glTF`` magic."""
+    if glb_bytes.startswith(b"glTF"):
+        return
+    # The Content-Type is only read to make the error message more useful.
+    reported_type = response.headers.get("Content-Type", "")
+    shown = reported_type or "unknown"
+    raise RuntimeError(
+        f"Geometry generation server returned invalid GLB content: {shown}"
+    )
+
+
+def _write_glb_output(
+    request: GeometryGenerationServerRequest, glb_bytes: bytes
+) -> Path:
+    """Write the GLB bytes to the request's resolved output path and return it."""
+    target = Path(request.output_path).expanduser().resolve()
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_bytes(glb_bytes)
+    if target.is_file():
+        log_info(f"Generated geometry written: {target}")
+        return target
+    raise FileNotFoundError(f"Generated geometry was not written: {target}")
+
+
+def parse_multi_object_generation_response(
+    response: requests.Response,
+    base_url: str,
+    *,
+    output_dir: Path | None = None,
+    session: requests.Session | None = None,
+) -> MultiObjectGenerationServerResponse:
+    """Turn the server's JSON reply into a typed multi-object response.
+
+    Raises RuntimeError when ``ok`` is not literally true or when the
+    ``result`` object is missing. Mesh handling (URL vs. download into
+    ``output_dir``) is delegated to ``_parse_multi_object_items``.
+    """
+    payload = _parse_json_body(response)
+    succeeded = payload.get("ok", False)
+    if succeeded is not True:
+        reason = payload.get("error", "ok is not true")
+        raise RuntimeError(f"Multi-object geometry generation failed: {reason}")
+    result_section = payload.get("result")
+    if not isinstance(result_section, dict):
+        raise RuntimeError(
+            "Multi-object geometry generation response missing 'result' object"
+        )
+    parsed_objects = _parse_multi_object_items(
+        result_section,
+        base_url.rstrip("/"),
+        output_dir=output_dir,
+        session=session,
+    )
+    return MultiObjectGenerationServerResponse(
+        ok=True,
+        status=str(payload.get("status") or "ok"),
+        result=MultiObjectGenerationResult(objects=parsed_objects),
+        status_code=response.status_code,
+        content_type=response.headers.get("Content-Type"),
+        headers=dict(response.headers),
+    )
+
+
+def _parse_multi_object_items(
+    body: dict[str, object],
+    base_url: str,
+    *,
+    output_dir: Path | None,
+    session: requests.Session | None,
+) -> list[MultiObjectGenerationObject]:
+    """Parse every entry of the ``objects`` list found in ``body``.
+
+    Raises RuntimeError when ``objects`` is absent, not a list, or empty.
+    """
+    entries = body.get("objects")
+    if not (isinstance(entries, list) and entries):
+        raise RuntimeError(
+            "Multi-object geometry generation response missing 'result.objects' list"
+        )
+    # Keyword arguments that are identical for every item.
+    shared = {"base_url": base_url, "output_dir": output_dir, "session": session}
+    parsed: list[MultiObjectGenerationObject] = []
+    for position, entry in enumerate(entries):
+        parsed.append(_parse_multi_object_item(entry, index=position, **shared))
+    return parsed
+
+
+def _parse_multi_object_item(
+    item: object,
+    *,
+    index: int,
+    base_url: str,
+    output_dir: Path | None,
+    session: requests.Session | None,
+) -> MultiObjectGenerationObject:
+    """Convert one raw ``objects`` entry into a ``MultiObjectGenerationObject``.
+
+    The entry must be a JSON object with a non-empty string ``mesh`` plus
+    ``rotation_quaternion_wxyz`` (4 numbers), ``translation`` (3) and
+    ``scale`` (3); anything else raises RuntimeError.
+    """
+    if not isinstance(item, dict):
+        raise RuntimeError(f"Multi-object item {index} must be a JSON object")
+
+    mesh_ref = item.get("mesh")
+    if not (isinstance(mesh_ref, str) and mesh_ref):
+        raise RuntimeError(f"Multi-object item {index} missing 'mesh'")
+
+    # Name precedence: explicit "name", then the mesh file stem, then the index.
+    label = str(item.get("name") or Path(mesh_ref).stem or index)
+    local_or_remote = _resolve_or_download_glb(
+        base_url,
+        mesh_ref,
+        name=label,
+        index=index,
+        output_dir=output_dir,
+        session=session,
+    )
+
+    # All three vectors share the same lookup-and-validate logic.
+    def vector(key: str, size: int) -> list[float]:
+        field_path = f"objects[{index}].{key}"
+        return _float_list(item.get(key), expected_len=size, field_name=field_path)
+
+    return MultiObjectGenerationObject(
+        name=label,
+        geometry_path=local_or_remote,
+        rotation_quaternion_wxyz=vector("rotation_quaternion_wxyz", 4),
+        translation=vector("translation", 3),
+        scale=vector("scale", 3),
+    )
+
+
+def _resolve_or_download_glb(
+    base_url: str,
+    mesh_rel_path: str,
+    *,
+    name: str,
+    index: int,
+    output_dir: Path | None,
+    session: requests.Session | None,
+) -> str:
+    """Return the mesh URL, or the local path after downloading to ``output_dir``."""
+    url = _join_url(base_url, mesh_rel_path)
+    if output_dir is None:
+        return url
+    output_dir = output_dir.expanduser().resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    safe_name = Path(name).name  # keep only the last component: no "../" escape
+    dest = output_dir / (f"{safe_name}.glb" if safe_name else f"{index}.glb")
+    _download_glb(url, dest, session=session)
+    return str(dest)
+
+
+def _join_url(base_url: str, path_or_url: str) -> str:
+    """Return absolute URLs untouched; otherwise append the path to ``base_url``."""
+    if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
+        return path_or_url
+    separator = "" if path_or_url.startswith("/") else "/"
+    return f"{base_url}{separator}{path_or_url}"
+
+
+def _float_list(value: object, *, expected_len: int, field_name: str) -> list[float]:
+    """Coerce a JSON list of exactly ``expected_len`` items to floats, or raise."""
+    if not (isinstance(value, list) and len(value) == expected_len):
+        raise RuntimeError(f"Multi-object geometry response missing '{field_name}'")
+    try:
+        return list(map(float, value))
+    except (TypeError, ValueError) as exc:
+        message = f"Multi-object geometry response field '{field_name}' must be numeric"
+        raise RuntimeError(message) from exc
+
+
+def _parse_json_body(response: requests.Response) -> dict[str, object]:
+    """Decode the response as JSON and require the top level to be an object."""
+    try:
+        decoded = response.json()
+    except ValueError as exc:
+        raise RuntimeError(
+            "Multi-object geometry generation server returned invalid JSON"
+        ) from exc
+    if isinstance(decoded, dict):
+        return decoded
+    reason = "Multi-object geometry generation response must be a JSON object"
+    raise RuntimeError(reason)
+
+
+def _download_glb(url: str, dest: Path, *, session: requests.Session | None) -> None:
+    """Download a GLB from the geometry server and write it to ``dest``."""
+    # Without a caller session a temporary one is created and must be released.
+    http = session if session is not None else requests.Session()
+    try:
+        r = http.get(url, timeout=30)
+        r.raise_for_status()
+        _validate_glb_response(r, r.content)
+    finally:
+        if session is None:
+            http.close()  # close only the session created here, not the caller's
+    dest.write_bytes(r.content)
+    log_info(f"Generated geometry written: {dest}")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/schemas.py
new file mode 100644
index 000000000..d8ede9eea
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/geometry_generation_client/schemas.py
@@ -0,0 +1,134 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import ClientError
+
+__all__ = [
+ "GeometryGenerationError",
+ "GeometryGenerationResult",
+ "GeometryGenerationServerRequest",
+ "GeometryGenerationServerResponse",
+ "MultiObjectGenerationError",
+ "MultiObjectGenerationObject",
+ "MultiObjectGenerationResult",
+ "MultiObjectGenerationServerRequest",
+ "MultiObjectGenerationServerResponse",
+]
+
+
+@dataclass(frozen=True)
+class GeometryGenerationServerRequest:
+    """Single-object request for the Geometry Generation server.
+
+    Args:
+        image_path: Local object image path.
+        output_path: Local output GLB path where the client saves the generated geometry.
+    """
+
+    image_path: str | Path
+    output_path: str | Path
+
+    def to_form_data(self) -> dict[str, str]:
+        """Build the multipart form fields sent to the geometry server.
+
+        Returns:
+            An empty mapping; neither path is sent as a form field.
+        """
+        form_fields: dict[str, str] = {}
+        return form_fields
+
+
+@dataclass(frozen=True)
+class GeometryGenerationResult:
+    """Outcome of a successful single-object geometry generation."""
+
+    # Path of the generated geometry file (presumably the saved GLB; confirm
+    # against the geometry parser).
+    geometry_path: str
+
+
+@dataclass(frozen=True)
+class GeometryGenerationServerResponse:
+    """Typed view of a successful Geometry Generation server reply.
+
+    ``ok`` and ``result`` are required; the remaining fields default to
+    ``None``, and ``headers`` defaults to a fresh empty dict.
+    """
+
+    ok: bool
+    result: GeometryGenerationResult
+    status: str | None = None
+    error: str | None = None
+    status_code: int | None = None
+    content_type: str | None = None
+    headers: dict[str, str] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class GeometryGenerationError(ClientError):
+    """Failure record for a single-object geometry generation request."""
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationServerRequest:
+    """Multi-object request for the Geometry Generation server.
+
+    Args:
+        image_path: Local scene RGB image path.
+        mask_paths: Local mask PNG file paths (one per object).
+    """
+
+    image_path: str | Path
+    mask_paths: list[Path]
+
+    def to_form_data(self) -> dict[str, str]:
+        """Build the multipart form fields sent to the geometry server.
+
+        Returns:
+            A mapping holding only the ``json`` field set to ``"1"``.
+        """
+        form_fields = {"json": "1"}
+        return form_fields
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationObject:
+    """One generated object inside a multi-object geometry generation result."""
+
+    name: str
+    # Path of this object's generated geometry file.
+    geometry_path: str
+    # Rotation as a quaternion; component order w, x, y, z per the field name.
+    rotation_quaternion_wxyz: list[float]
+    translation: list[float]
+    scale: list[float]
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationResult:
+    """Outcome of a successful multi-object geometry generation."""
+
+    objects: list[MultiObjectGenerationObject]
+
+    @property
+    def geometry_paths(self) -> list[str]:
+        """Return each object's ``geometry_path`` in ``objects`` order."""
+        paths: list[str] = []
+        for generated in self.objects:
+            paths.append(generated.geometry_path)
+        return paths
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationServerResponse:
+    """Typed view of a successful multi-object Geometry Generation reply.
+
+    ``ok`` and ``result`` are required; the remaining fields default to
+    ``None``, and ``headers`` defaults to a fresh empty dict.
+    """
+
+    ok: bool
+    result: MultiObjectGenerationResult
+    status: str | None = None
+    error: str | None = None
+    status_code: int | None = None
+    content_type: str | None = None
+    headers: dict[str, str] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationError(ClientError):
+    """Failure record for a multi-object geometry generation request."""
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/__init__.py
new file mode 100644
index 000000000..c112bd3d7
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/__init__.py
@@ -0,0 +1,39 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client.client import (
+ ImageGenerationClient,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client.schemas import (
+ ImageGenerationError,
+ ImageGenerationResult,
+ ImageGenerationServerRequest,
+ ImageGenerationServerResponse,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+ DEFAULT_CLIENT_CONFIG_PATH,
+)
+
+__all__ = [
+ "DEFAULT_CLIENT_CONFIG_PATH",
+ "ImageGenerationClient",
+ "ImageGenerationError",
+ "ImageGenerationResult",
+ "ImageGenerationServerRequest",
+ "ImageGenerationServerResponse",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/client.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/client.py
new file mode 100644
index 000000000..6f23d47bd
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/client.py
@@ -0,0 +1,117 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Client for the Z-Image image generation server."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.base import BaseHttpClient
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+ validate_required_strings,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+ DEFAULT_CLIENT_CONFIG_PATH,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client.parser import (
+ parse_generation_response,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client.schemas import (
+ ImageGenerationError,
+ ImageGenerationResult,
+ ImageGenerationServerRequest,
+ ImageGenerationServerResponse,
+)
+
+__all__ = [
+ "DEFAULT_CLIENT_CONFIG_PATH",
+ "ImageGenerationClient",
+ "ImageGenerationError",
+ "ImageGenerationResult",
+ "ImageGenerationServerRequest",
+ "ImageGenerationServerResponse",
+]
+
+
+class ImageGenerationClient(BaseHttpClient):
+    """HTTP client that requests one generated image at a time from Z-Image."""
+
+    def __init__(
+        self,
+        *,
+        base_url: str | None = None,
+        timeout_s: int | None = None,
+        config_path: Path | None = None,
+        config_key: str = "zimage",
+        session: requests.Session | None = None,
+    ) -> None:
+        """Set up the base client and resolve the generation endpoint path.
+
+        All arguments are forwarded to ``BaseHttpClient``. The endpoint path
+        is read from the ``generate_single_object_path`` config entry and
+        falls back to ``/generate.png``.
+        """
+        super().__init__(
+            config_key=config_key,
+            server_name="Image generation server",
+            base_url=base_url,
+            timeout_s=timeout_s,
+            config_path=config_path,
+            session=session,
+        )
+        endpoint = self.config.get("generate_single_object_path", "/generate.png")
+        self.generate_single_object_path = str(endpoint)
+
+    def generate(
+        self,
+        request: ImageGenerationServerRequest,
+        *,
+        max_retries: int = 3,
+    ) -> ImageGenerationServerResponse | ImageGenerationError:
+        """Generate one image and save the returned PNG locally.
+
+        Args:
+            request: Prompt and local output path for the image.
+            max_retries: Retry budget handed to ``post_with_retries``.
+
+        Returns:
+            The parsed response, or the ``ImageGenerationError`` produced by
+            ``post_with_retries``.
+        """
+        _validate_request(request)
+        endpoint_url = f"{self.base_url}{self.generate_single_object_path}"
+
+        def send() -> requests.Response:
+            return _post_generation_request(self, endpoint_url, request)
+
+        outcome = self.post_with_retries(
+            send,
+            max_retries=max_retries,
+            error_cls=ImageGenerationError,
+            request_label="image_generation",
+        )
+        if isinstance(outcome, ImageGenerationError):
+            return outcome
+        return parse_generation_response(outcome, request)
+
+
+def _validate_request(request: ImageGenerationServerRequest) -> None:
+    """Check the prompt and output path before any HTTP call is made.
+
+    Both fields go through ``validate_required_strings`` (presumably it
+    rejects missing or empty values; confirm in ``clients.common``).
+
+    Raises:
+        ValueError: If ``output_path`` does not end in ``.png``, compared
+            case-insensitively.
+    """
+    required_fields = {
+        "Image generation prompt": request.prompt,
+        "Image generation output_path": request.output_path,
+    }
+    validate_required_strings(required_fields)
+    target = str(request.output_path).lower()
+    if target.endswith(".png"):
+        return
+    raise ValueError("Image generation output_path must be a PNG file path.")
+
+
+def _post_generation_request(
+    client: ImageGenerationClient,
+    url: str,
+    request: ImageGenerationServerRequest,
+) -> requests.Response:
+    """POST the request's JSON payload to ``url`` on the client's session."""
+    payload = request.to_dict()
+    # Tuple timeout: 10 s to connect, ``client.timeout_s`` to read.
+    timeouts = (10, client.timeout_s)
+    return client.session.post(url, json=payload, timeout=timeouts)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/parser.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/parser.py
new file mode 100644
index 000000000..a43ee0307
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/parser.py
@@ -0,0 +1,65 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+ validate_png_response,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client.schemas import (
+ ImageGenerationResult,
+ ImageGenerationServerRequest,
+ ImageGenerationServerResponse,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import log_info
+
+__all__ = ["parse_generation_response"]
+
+
+def parse_generation_response(
+    response: requests.Response,
+    request: ImageGenerationServerRequest,
+) -> ImageGenerationServerResponse:
+    """Validate the PNG body, save it, and wrap the outcome in a response.
+
+    Args:
+        response: HTTP response whose body is expected to be PNG bytes.
+        request: Original request; supplies the local output path.
+
+    Returns:
+        A response with ``ok=True`` and ``status="ok"``, pointing at the
+        saved image and carrying the HTTP status code and headers.
+    """
+    body = response.content
+    validate_png_response(response, body)
+    saved_path = _write_png_output(request, body)
+    http_headers = dict(response.headers)
+    return ImageGenerationServerResponse(
+        ok=True,
+        result=ImageGenerationResult(image_path=str(saved_path)),
+        status="ok",
+        status_code=response.status_code,
+        content_type=response.headers.get("Content-Type"),
+        headers=http_headers,
+    )
+
+
+def _write_png_output(
+    request: ImageGenerationServerRequest,
+    png_bytes: bytes,
+) -> Path:
+    """Write ``png_bytes`` to the request's output path and return that path.
+
+    The path is user-expanded and resolved, and missing parent directories
+    are created before writing.
+
+    Raises:
+        FileNotFoundError: If the file does not exist after the write.
+    """
+    target = Path(request.output_path).expanduser().resolve()
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_bytes(png_bytes)
+    if target.is_file():
+        log_info(f"Generated image written: {target}")
+        return target
+    raise FileNotFoundError(f"Generated image was not written: {target}")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/schemas.py
new file mode 100644
index 000000000..09c845bac
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_generation_client/schemas.py
@@ -0,0 +1,72 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import ClientError
+
+__all__ = [
+ "ImageGenerationError",
+ "ImageGenerationResult",
+ "ImageGenerationServerRequest",
+ "ImageGenerationServerResponse",
+]
+
+
+@dataclass(frozen=True)
+class ImageGenerationServerRequest:
+    """Generation request for the Z-Image server.
+
+    Args:
+        prompt: Text prompt used to generate the image.
+        output_path: Local output PNG path where the client saves the response.
+    """
+
+    prompt: str
+    output_path: str | Path
+
+    def to_dict(self) -> dict[str, Any]:
+        """Build the JSON payload for the Z-Image server.
+
+        Returns:
+            A mapping holding only ``prompt``; ``output_path`` stays local.
+        """
+        payload: dict[str, Any] = {"prompt": self.prompt}
+        return payload
+
+
+@dataclass(frozen=True)
+class ImageGenerationResult:
+    """Outcome of a successful Z-Image generation."""
+
+    # Path of the image file saved for this generation.
+    image_path: str
+
+
+@dataclass(frozen=True)
+class ImageGenerationServerResponse:
+    """Typed view of a successful Z-Image server reply.
+
+    ``ok`` and ``result`` are required; the remaining fields default to
+    ``None``, and ``headers`` defaults to a fresh empty dict.
+    """
+
+    ok: bool
+    result: ImageGenerationResult
+    status: str | None = None
+    error: str | None = None
+    status_code: int | None = None
+    content_type: str | None = None
+    headers: dict[str, str] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class ImageGenerationError(ClientError):
+    """Failure record for an image generation request."""
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/__init__.py
new file mode 100644
index 000000000..a503f2875
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/__init__.py
@@ -0,0 +1,61 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+ DEFAULT_CLIENT_CONFIG_PATH,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.client import (
+ ImageSegmentationClient,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.schemas import (
+ ImageSegmentationCandidate,
+ ImageSegmentationError,
+ ImageSegmentationResult,
+ ImageSegmentationServerRequest,
+ ImageSegmentationServerResponse,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.utils import (
+ apply_mask_to_alpha,
+ bbox_iou,
+ decode_rle_mask,
+ draw_labeled_bboxes,
+ draw_numbered_bboxes,
+ draw_numbered_masks,
+ is_usable_segmentation_candidate,
+ save_candidate_rgba_and_mask,
+ sort_segments_by_bbox,
+)
+
+__all__ = [
+ "DEFAULT_CLIENT_CONFIG_PATH",
+ "ImageSegmentationCandidate",
+ "ImageSegmentationClient",
+ "ImageSegmentationError",
+ "ImageSegmentationResult",
+ "ImageSegmentationServerRequest",
+ "ImageSegmentationServerResponse",
+ "apply_mask_to_alpha",
+ "bbox_iou",
+ "decode_rle_mask",
+ "draw_labeled_bboxes",
+ "draw_numbered_bboxes",
+ "draw_numbered_masks",
+ "is_usable_segmentation_candidate",
+ "save_candidate_rgba_and_mask",
+ "sort_segments_by_bbox",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/client.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/client.py
new file mode 100644
index 000000000..1a880bb62
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/client.py
@@ -0,0 +1,132 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Client for the SAM3 image segmentation server."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.base import BaseHttpClient
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+ validate_required_strings,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.config import (
+ DEFAULT_CLIENT_CONFIG_PATH,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.parser import (
+ parse_segmentation_response,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.schemas import (
+ ImageSegmentationCandidate,
+ ImageSegmentationError,
+ ImageSegmentationResult,
+ ImageSegmentationServerRequest,
+ ImageSegmentationServerResponse,
+)
+
+__all__ = [
+ "DEFAULT_CLIENT_CONFIG_PATH",
+ "ImageSegmentationCandidate",
+ "ImageSegmentationClient",
+ "ImageSegmentationError",
+ "ImageSegmentationResult",
+ "ImageSegmentationServerRequest",
+ "ImageSegmentationServerResponse",
+]
+
+
+class ImageSegmentationClient(BaseHttpClient):
+    """HTTP client that sends one image per request to the SAM3 server."""
+
+    def __init__(
+        self,
+        *,
+        base_url: str | None = None,
+        timeout_s: int | None = None,
+        config_path: Path | None = None,
+        config_key: str = "sam3_segmentation",
+        session: requests.Session | None = None,
+    ) -> None:
+        """Set up the base client and resolve the segmentation endpoint path.
+
+        All arguments are forwarded to ``BaseHttpClient`` together with
+        ``trust_env=False``. The endpoint path is read from the
+        ``segment_single_object_path`` config entry and falls back to
+        ``/segment_single_object``.
+        """
+        super().__init__(
+            config_key=config_key,
+            server_name="Image segmentation server",
+            base_url=base_url,
+            timeout_s=timeout_s,
+            config_path=config_path,
+            session=session,
+            trust_env=False,
+        )
+        endpoint = self.config.get(
+            "segment_single_object_path", "/segment_single_object"
+        )
+        self.segmentation_path = str(endpoint)
+
+    def segment(
+        self,
+        request: ImageSegmentationServerRequest,
+        *,
+        max_retries: int = 3,
+    ) -> ImageSegmentationServerResponse | ImageSegmentationError:
+        """Segment one image with a text prompt.
+
+        Args:
+            request: Prompt and local input image path.
+            max_retries: Retry budget handed to ``post_with_retries``.
+
+        Returns:
+            The parsed response, or the ``ImageSegmentationError`` produced
+            by ``post_with_retries``.
+        """
+        _validate_request(request)
+        endpoint_url = f"{self.base_url}{self.segmentation_path}"
+
+        def send() -> requests.Response:
+            return _post_segmentation_request(self, endpoint_url, request)
+
+        outcome = self.post_with_retries(
+            send,
+            max_retries=max_retries,
+            error_cls=ImageSegmentationError,
+            request_label="image_segmentation",
+        )
+        if isinstance(outcome, ImageSegmentationError):
+            return outcome
+        return parse_segmentation_response(outcome, request)
+
+
+def _validate_request(request: ImageSegmentationServerRequest) -> None:
+    """Check that the request names an existing local image file.
+
+    ``image_path`` goes through ``validate_required_strings`` (presumably it
+    rejects missing or empty values; confirm in ``clients.common``).
+
+    Raises:
+        FileNotFoundError: If the user-expanded ``image_path`` is not a file.
+    """
+    validate_required_strings({"Image segmentation image_path": request.image_path})
+    candidate = Path(request.image_path).expanduser()
+    if candidate.is_file():
+        return
+    raise FileNotFoundError(f"Image segmentation input not found: {candidate}")
+
+
+def _post_segmentation_request(
+    client: ImageSegmentationClient,
+    url: str,
+    request: ImageSegmentationServerRequest,
+) -> requests.Response:
+    """Upload the image and form fields to ``url`` as a multipart POST.
+
+    The image file is opened for the duration of the call only and is sent
+    under the ``image`` field with its base filename.
+    """
+    upload_name = Path(request.image_path).name
+    with _open_image_file(request.image_path) as handle:
+        form_fields = request.to_form_data()
+        upload = {"image": (upload_name, handle)}
+        # Tuple timeout: 10 s to connect, ``client.timeout_s`` to read.
+        return client.session.post(
+            url,
+            data=form_fields,
+            files=upload,
+            timeout=(10, client.timeout_s),
+        )
+
+
+def _open_image_file(image_path: str | Path) -> Any:
+    """Open the user-expanded, resolved ``image_path`` for binary reading."""
+    resolved = Path(image_path).expanduser().resolve()
+    return resolved.open("rb")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/parser.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/parser.py
new file mode 100644
index 000000000..762a1b43c
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/parser.py
@@ -0,0 +1,218 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+import requests
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import (
+ parse_json_object_response,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.schemas import (
+ ImageSegmentationCandidate,
+ ImageSegmentationResult,
+ ImageSegmentationServerRequest,
+ ImageSegmentationServerResponse,
+)
+
+__all__ = ["parse_segmentation_response"]
+
+# Name passed to the shared JSON parser as ``server_name``.
+SERVER_NAME = "Image segmentation server"
+
+
+def parse_segmentation_response(
+    response: requests.Response,
+    request: ImageSegmentationServerRequest,
+) -> ImageSegmentationServerResponse:
+    """Turn a SAM3 server JSON reply into a typed segmentation response.
+
+    Args:
+        response: HTTP response carrying the JSON body.
+        request: Original request; supplies fallbacks for echoed fields.
+
+    Returns:
+        A response whose ``ok`` defaults to ``True`` and whose ``status``
+        defaults to ``"ok"`` when the body omits them.
+    """
+    payload = parse_json_object_response(response, server_name=SERVER_NAME)
+    parsed_result = _parse_segmentation_result(payload, request)
+    status_text = _string_or_none(payload.get("status"))
+    if not status_text:
+        status_text = "ok"
+    return ImageSegmentationServerResponse(
+        ok=bool(payload.get("ok", True)),
+        result=parsed_result,
+        status=status_text,
+        status_code=response.status_code,
+        content_type=response.headers.get("Content-Type"),
+        headers=dict(response.headers),
+    )
+
+
+def _parse_segmentation_result(
+    response_data: dict[str, Any],
+    request: ImageSegmentationServerRequest,
+) -> ImageSegmentationResult:
+    """Build the typed result from the response body.
+
+    The result fields are read from the ``result`` object if present, else
+    from ``data``, else from the top-level body. A missing ``image_path`` or
+    ``prompt`` falls back to the request's value; ``box_format`` and
+    ``mask_format`` fall back to ``"xyxy"`` and ``"rle"``.
+    """
+    section: Any = None
+    for wrapper_key in ("result", "data"):
+        section = response_data.get(wrapper_key)
+        if isinstance(section, dict):
+            break
+    else:
+        # No usable wrapper object: the body itself holds the fields.
+        section = response_data
+
+    echoed_image_path = _string_or_none(section.get("image_path"))
+    echoed_prompt = _string_or_none(section.get("prompt"))
+    return ImageSegmentationResult(
+        image_path=echoed_image_path or str(request.image_path),
+        prompt=echoed_prompt or request.prompt,
+        candidates=_parse_candidates(section),
+        request_id=_string_or_none(section.get("request_id")),
+        elapsed_sec=_float_or_none(section.get("elapsed_sec")),
+        count=_int_or_none(section.get("count")),
+        image_width=_parse_image_width(section),
+        image_height=_parse_image_height(section),
+        box_format=_string_or_none(section.get("box_format")) or "xyxy",
+        mask_format=_string_or_none(section.get("mask_format")) or "rle",
+    )
+
+
+def _parse_candidates(result_data: dict[str, Any]) -> list[ImageSegmentationCandidate]:
+    """Extract segmentation candidates from either supported layout.
+
+    The first of ``instances``, ``candidates``, ``segmentations`` or
+    ``detections`` that holds a list is parsed item by item, skipping
+    non-dict entries. Otherwise the parallel ``boxes``/``scores``/``masks``
+    lists are zipped by index, driven by ``boxes``.
+    """
+    for list_key in ("instances", "candidates", "segmentations", "detections"):
+        entries = result_data.get(list_key)
+        if not isinstance(entries, list):
+            continue
+        parsed: list[ImageSegmentationCandidate] = []
+        # The position counts skipped entries too, so fallback ids match the
+        # server's ordering.
+        for position, entry in enumerate(entries):
+            if isinstance(entry, dict):
+                parsed.append(_parse_candidate_item(entry, position))
+        return parsed
+
+    boxes = result_data.get("boxes", [])
+    scores = result_data.get("scores", [])
+    masks = result_data.get("masks", [])
+    if not isinstance(boxes, list):
+        return []
+
+    return [
+        ImageSegmentationCandidate(
+            candidate_id=f"candidate_{index}",
+            bbox_xyxy=_float_list(box),
+            score=_float_or_zero(_list_get(scores, index)),
+            mask_rle=_mask_or_none(_list_get(masks, index)),
+        )
+        for index, box in enumerate(boxes)
+    ]
+
+
+def _parse_candidate_item(
+    item: dict[str, Any],
+    index: int,
+) -> ImageSegmentationCandidate:
+    """Normalize one candidate dict, accepting several key spellings.
+
+    Args:
+        item: Raw candidate object from the server.
+        index: Position of the item in its list, used for the fallback id.
+
+    Returns:
+        A candidate whose ``metadata`` holds every key not recognized here.
+    """
+    recognized = frozenset(
+        (
+            "candidate_id",
+            "id",
+            "index",
+            "bbox_xyxy",
+            "box_xyxy",
+            "box",
+            "bbox",
+            "score",
+            "mask_rle",
+            "mask",
+            "segmentation",
+            "mask_path",
+            "label",
+        )
+    )
+    # Id preference: explicit string id, then server index, then position.
+    identifier = (
+        _string_or_none(item.get("candidate_id"))
+        or _string_or_none(item.get("id"))
+        or _index_id_or_none(item.get("index"))
+        or f"candidate_{index}"
+    )
+    raw_box = (
+        item.get("bbox_xyxy")
+        or item.get("box_xyxy")
+        or item.get("box")
+        or item.get("bbox")
+    )
+    raw_mask = item.get("mask_rle") or item.get("mask") or item.get("segmentation")
+    extras = {key: value for key, value in item.items() if key not in recognized}
+    return ImageSegmentationCandidate(
+        candidate_id=identifier,
+        bbox_xyxy=_float_list(raw_box),
+        score=_float_or_zero(item.get("score")),
+        mask_rle=_mask_or_none(raw_mask),
+        mask_path=_string_or_none(item.get("mask_path")),
+        label=_string_or_none(item.get("label")),
+        metadata=extras,
+    )
+
+
+def _list_get(values: Any, index: int) -> Any:
+    """Return ``values[index]``, or ``None`` if not a list or index too large."""
+    if isinstance(values, list) and index < len(values):
+        return values[index]
+    return None
+
+
+def _float_list(value: Any) -> list[float]:
+    """Convert a list to floats, skipping items that cannot be converted.
+
+    Args:
+        value: Raw value from the server; anything but a list yields ``[]``.
+
+    Returns:
+        The convertible items as floats, in their original order.
+    """
+    if not isinstance(value, list):
+        return []
+    parsed: list[float] = []
+    for item in value:
+        try:
+            parsed.append(float(item))
+        except (TypeError, ValueError, OverflowError):
+            # Best-effort: skip bad items. OverflowError covers JSON integers
+            # too large for a float.
+            continue
+    return parsed
+
+
+def _float_or_zero(value: Any) -> float:
+    """Return ``float(value)``, or ``0.0`` when the conversion fails."""
+    try:
+        return float(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: integer too large to be represented as a float.
+        return 0.0
+
+
+def _float_or_none(value: Any) -> float | None:
+    """Return ``float(value)``, or ``None`` when the conversion fails."""
+    try:
+        return float(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: integer too large to be represented as a float.
+        return None
+
+
+def _int_or_none(value: Any) -> int | None:
+    """Return ``int(value)``, or ``None`` when the conversion fails."""
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: int() of an infinite float, which the JSON decoder
+        # can produce from "Infinity" or an out-of-range exponent.
+        return None
+
+
+def _string_or_none(value: Any) -> str | None:
+    """Return ``value`` unchanged if it is a ``str``, otherwise ``None``."""
+    if isinstance(value, str):
+        return value
+    return None
+
+
+def _mask_or_none(value: Any) -> dict[str, Any] | None:
+    """Return ``value`` unchanged if it is a ``dict``, otherwise ``None``."""
+    if isinstance(value, dict):
+        return value
+    return None
+
+
+def _index_id_or_none(value: Any) -> str | None:
+    """Return ``candidate_<n>`` for an int-convertible value, else ``None``."""
+    parsed_index = _int_or_none(value)
+    if parsed_index is None:
+        return None
+    return f"candidate_{parsed_index}"
+
+
+def _parse_image_width(result_data: dict[str, Any]) -> int | None:
+    """Read the image width from ``image_size.width`` or ``image_width``.
+
+    The nested ``image_size`` object wins when it holds a usable width.
+    """
+    size_info = result_data.get("image_size")
+    nested_width = (
+        _int_or_none(size_info.get("width")) if isinstance(size_info, dict) else None
+    )
+    if nested_width is not None:
+        return nested_width
+    return _int_or_none(result_data.get("image_width"))
+
+
+def _parse_image_height(result_data: dict[str, Any]) -> int | None:
+    """Read the image height from ``image_size.height`` or ``image_height``.
+
+    The nested ``image_size`` object wins when it holds a usable height.
+    """
+    size_info = result_data.get("image_size")
+    nested_height = (
+        _int_or_none(size_info.get("height")) if isinstance(size_info, dict) else None
+    )
+    if nested_height is not None:
+        return nested_height
+    return _int_or_none(result_data.get("image_height"))
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/schemas.py
new file mode 100644
index 000000000..3945bf4bd
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/schemas.py
@@ -0,0 +1,103 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.common import ClientError
+
+__all__ = [
+ "ImageSegmentationCandidate",
+ "ImageSegmentationError",
+ "ImageSegmentationResult",
+ "ImageSegmentationServerRequest",
+ "ImageSegmentationServerResponse",
+]
+
+
+@dataclass(frozen=True)
+class ImageSegmentationServerRequest:
+    """Segmentation request for the SAM3 server.
+
+    Args:
+        prompt: Short text concept prompt.
+        image_path: Local input image path.
+    """
+
+    prompt: str
+    image_path: str | Path
+
+    def to_form_data(self) -> dict[str, str]:
+        """Build the multipart form fields sent to the SAM3 server.
+
+        Returns:
+            The prompt plus a fixed ``score_threshold`` of ``"0.0"`` and a
+            fixed ``max_instances`` of ``"5"``.
+        """
+        form_fields = {"prompt": self.prompt}
+        form_fields["score_threshold"] = "0.0"
+        form_fields["max_instances"] = "5"
+        return form_fields
+
+
+@dataclass(frozen=True)
+class ImageSegmentationCandidate:
+    """A single SAM3 segmentation candidate for the prompted concept.
+
+    SAM3 image inference yields masks, boxes, and scores as parallel lists;
+    the client folds one aligned mask/box/score entry into this record.
+    """
+
+    candidate_id: str
+    bbox_xyxy: list[float]
+    score: float
+    mask_rle: dict[str, Any] | None = None
+    mask_path: str | None = None
+    label: str | None = None
+    # Extra fields that have no dedicated attribute.
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class ImageSegmentationResult:
+    """Outcome of a successful SAM3 segmentation.
+
+    ``image_path``, ``prompt`` and ``candidates`` are required. The other
+    fields default to ``None``, except ``box_format`` which defaults to
+    ``"xyxy"``.
+    """
+
+    image_path: str
+    prompt: str
+    candidates: list[ImageSegmentationCandidate]
+    request_id: str | None = None
+    elapsed_sec: float | None = None
+    count: int | None = None
+    image_width: int | None = None
+    image_height: int | None = None
+    box_format: str = "xyxy"
+    mask_format: str | None = None
+
+
+@dataclass(frozen=True)
+class ImageSegmentationServerResponse:
+    """Typed view of a successful SAM3 server reply.
+
+    ``ok`` and ``result`` are required; the remaining fields default to
+    ``None``, and ``headers`` defaults to a fresh empty dict.
+    """
+
+    ok: bool
+    result: ImageSegmentationResult
+    status: str | None = None
+    error: str | None = None
+    status_code: int | None = None
+    content_type: str | None = None
+    headers: dict[str, str] = field(default_factory=dict)
+
+
+@dataclass(frozen=True)
+class ImageSegmentationError(ClientError):
+    """Failure record for an image segmentation request."""
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/utils.py b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/utils.py
new file mode 100644
index 000000000..3bef48923
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/clients/image_segmentation_client/utils.py
@@ -0,0 +1,344 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from PIL import Image, ImageDraw, ImageFilter, ImageFont
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client.schemas import (
+ ImageSegmentationCandidate,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import log_info
+
+__all__ = [
+ "apply_mask_to_alpha",
+ "bbox_iou",
+ "decode_rle_mask",
+ "draw_labeled_bboxes",
+ "draw_numbered_bboxes",
+ "draw_numbered_masks",
+ "is_usable_segmentation_candidate",
+ "save_candidate_rgba_and_mask",
+ "sort_segments_by_bbox",
+]
+
+
+def decode_rle_mask(mask_rle: dict[str, Any]) -> Image.Image:
+    """Expand an uncompressed SAM3 run-length mask into an ``L``-mode image.
+
+    ``counts`` holds alternating run lengths over ``height * width`` pixels; the
+    first run is foreground (255) only when ``starts_with`` is truthy.
+    """
+    size, counts = mask_rle.get("size"), mask_rle.get("counts")
+    if not _is_size_pair(size):
+        raise ValueError("SAM3 mask_rle requires size=[height, width].")
+    if not isinstance(counts, list):
+        raise ValueError("SAM3 mask_rle counts must be an uncompressed list.")
+
+    height, width = int(size[0]), int(size[1])
+    total = height * width
+    foreground = bool(int(mask_rle.get("starts_with", 0)))
+    buffer = bytearray(total)  # zero-filled: only foreground runs need writing
+    cursor = 0
+
+    for raw_run in counts:
+        run = int(raw_run)
+        if run < 0:
+            raise ValueError("SAM3 mask_rle counts must be non-negative.")
+        end = cursor + run
+        if end > total:
+            raise ValueError("SAM3 mask_rle counts exceed the expected image size.")
+        if foreground:
+            buffer[cursor:end] = b"\xff" * run
+        cursor, foreground = end, not foreground
+
+    if cursor != total:
+        raise ValueError(
+            "SAM3 mask_rle counts do not cover the expected image size: "
+            f"{cursor} != {total}."
+        )
+    return Image.frombytes("L", (width, height), bytes(buffer))
+
+
+def apply_mask_to_alpha(
+    image_path: str | Path,
+    mask: Image.Image,
+) -> Image.Image:
+    """Load ``image_path`` as RGBA and use ``mask`` (as ``L``) for its alpha."""
+    rgba = Image.open(image_path).convert("RGBA")
+    alpha_band = mask.convert("L")
+    if alpha_band.size != rgba.size:  # nearest-neighbour avoids blended edge values
+        alpha_band = alpha_band.resize(rgba.size, Image.Resampling.NEAREST)
+    rgba.putalpha(alpha_band)
+    return rgba
+
+
+def save_candidate_rgba_and_mask(
+    *,
+    image_path: str | Path,
+    candidate: ImageSegmentationCandidate,
+    output_dir: str | Path,
+    prefix: str | None = None,
+) -> dict[str, str]:
+    """Write ``<stem>_mask.png`` and ``<stem>_rgba.png`` for one candidate.
+
+    ``stem`` is ``prefix`` when truthy, else the candidate id. Returns both
+    written paths as strings; raises ``ValueError`` without a ``mask_rle``.
+    """
+    rle = candidate.mask_rle
+    if rle is None:
+        raise ValueError(f"Candidate {candidate.candidate_id} has no mask_rle.")
+
+    target_dir = Path(output_dir).expanduser().resolve()
+    target_dir.mkdir(parents=True, exist_ok=True)
+    stem = prefix or candidate.candidate_id
+    mask_file = target_dir / f"{stem}_mask.png"
+    rgba_file = target_dir / f"{stem}_rgba.png"
+    decoded = decode_rle_mask(rle)
+    decoded.save(mask_file)
+    apply_mask_to_alpha(image_path, decoded).save(rgba_file)
+    log_info(f"SAM3 mask written: {mask_file}")
+    log_info(f"SAM3 RGBA image written: {rgba_file}")
+    return {"mask_path": str(mask_file), "rgba_path": str(rgba_file)}
+
+
+def draw_numbered_bboxes(
+    *,
+    image_path: str | Path,
+    segments: list[dict[str, Any]],
+    output_path: str | Path,
+) -> Path:
+    """Render each segment's box labelled 1..N and save the annotated image.
+
+    Numbers follow the order of ``segments``; every segment must provide
+    ``"bbox_xyxy"``. Parent directories of ``output_path`` are created, and
+    the resolved destination path is returned.
+    """
+    canvas = Image.open(image_path).convert("RGB")
+    pen = ImageDraw.Draw(canvas)
+    font = _load_label_font(canvas.width)
+    for number, segment in enumerate(segments, start=1):
+        box = segment["bbox_xyxy"]
+        _draw_bbox_label(draw=pen, bbox_xyxy=box, label=str(number), font=font)
+    destination = Path(output_path).expanduser().resolve()
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    canvas.save(destination)
+    return destination
+
+
+def draw_numbered_masks(
+    *,
+    image_path: str | Path,
+    segments: list[dict[str, Any]],
+    output_path: str | Path,
+) -> Path:
+    """Overlay coloured mask outlines with 1..N labels and save as RGB.
+
+    Segments lacking ``"mask_rle"`` are skipped but still consume their number,
+    so labels stay aligned with positions in ``segments``. Colours cycle
+    through a six-entry palette. Returns the resolved output path.
+    """
+    base = Image.open(image_path).convert("RGBA")
+    overlay = Image.new("RGBA", base.size, (0, 0, 0, 0))
+    overlay_pen = ImageDraw.Draw(overlay)
+    font = _load_label_font(base.width)
+    palette = (
+        (255, 64, 64, 255),
+        (64, 160, 255, 255),
+        (64, 220, 120, 255),
+        (255, 190, 64, 255),
+        (190, 96, 255, 255),
+        (255, 96, 190, 255),
+    )
+    for position, segment in enumerate(segments):
+        rle = segment.get("mask_rle")
+        if rle is None:
+            continue
+        mask = decode_rle_mask(rle)
+        if mask.size != base.size:
+            mask = mask.resize(base.size, Image.Resampling.NEAREST)
+        # Tint only the thickened outline so the object interior stays visible.
+        tint = Image.new("RGBA", base.size, palette[position % len(palette)])
+        blank = Image.new("RGBA", base.size)
+        overlay.alpha_composite(Image.composite(tint, blank, _mask_outline(mask)))
+        _draw_mask_label(
+            draw=overlay_pen, segment=segment, mask=mask,
+            label=str(position + 1), font=font,
+        )
+
+    merged = Image.alpha_composite(base, overlay).convert("RGB")
+    destination = Path(output_path).expanduser().resolve()
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    merged.save(destination)
+    return destination
+
+
+def draw_labeled_bboxes(
+    *,
+    image_path: str | Path,
+    boxes: list[dict[str, Any]],
+    output_path: str | Path,
+) -> Path:
+    """Draw every box with its own ``"label"`` text and save the image.
+
+    Each entry needs a four-value ``"bbox_xyxy"`` and a ``"label"`` (shown via
+    ``str``). Parent directories of ``output_path`` are created, and the
+    resolved destination path is returned.
+    """
+    canvas = Image.open(image_path).convert("RGB")
+    pen = ImageDraw.Draw(canvas)
+    font = _load_label_font(canvas.width)
+    for box in boxes:
+        left, top, right, bottom = box["bbox_xyxy"]
+        corners, text = [left, top, right, bottom], str(box["label"])
+        _draw_bbox_label(draw=pen, bbox_xyxy=corners, label=text, font=font)
+
+    destination = Path(output_path).expanduser().resolve()
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    canvas.save(destination)
+    return destination
+
+
+def sort_segments_by_bbox(segments: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Return segments ordered by bbox top (y1), then left (x1), then best score.
+
+    The input list is left untouched; a new sorted list is returned.
+    """
+    def reading_order(segment: dict[str, Any]) -> tuple[float, float, float]:
+        bbox = segment["bbox_xyxy"]
+        return float(bbox[1]), float(bbox[0]), -float(segment["score"])
+
+    return sorted(segments, key=reading_order)
+
+
+def bbox_iou(bbox_a: list[float], bbox_b: list[float]) -> float:
+    """Return intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.
+
+    Inverted or empty boxes count as zero area, and the result is ``0.0``
+    whenever the union has no area.
+    """
+    a_left, a_top, a_right, a_bottom = bbox_a
+    b_left, b_top, b_right, b_bottom = bbox_b
+    overlap_w = max(0.0, min(a_right, b_right) - max(a_left, b_left))
+    overlap_h = max(0.0, min(a_bottom, b_bottom) - max(a_top, b_top))
+    overlap = overlap_w * overlap_h
+    area_a = max(0.0, a_right - a_left) * max(0.0, a_bottom - a_top)
+    area_b = max(0.0, b_right - b_left) * max(0.0, b_bottom - b_top)
+    union = area_a + area_b - overlap
+    return overlap / union if union > 0 else 0.0
+
+
+def is_usable_segmentation_candidate(
+    candidate: ImageSegmentationCandidate,
+) -> bool:
+    """Return True only for candidates with a ``mask_rle`` and a 4-value bbox."""
+    return not (candidate.mask_rle is None or len(candidate.bbox_xyxy) != 4)
+
+
+def _is_size_pair(value: Any) -> bool:
+    """Return True for a two-item list of ints, e.g. ``[height, width]``."""
+    if not isinstance(value, list) or len(value) != 2:
+        return False
+    # bool subclasses int, so True/False must be excluded explicitly; they are
+    # never meaningful image dimensions.
+    return all(isinstance(v, int) and not isinstance(v, bool) for v in value)
+
+
+def _load_label_font(image_width: int) -> ImageFont.ImageFont:
+    """Return DejaVu Sans Bold at max(24, width // 80) px, else PIL's default font."""
+    try:
+        return ImageFont.truetype("DejaVuSans-Bold.ttf", max(24, image_width // 80))
+    except OSError:  # raised when the font file cannot be opened
+        return ImageFont.load_default()
+
+
+def _draw_bbox_label(
+    *,
+    draw: ImageDraw.ImageDraw,
+    bbox_xyxy: list[float],
+    label: str,
+    font: ImageFont.ImageFont,
+) -> None:
+    """Outline the box in red and print ``label`` on a red tag.
+
+    The tag is anchored at the box's ``(x1, y1)`` corner with 8 px padding.
+    """
+    left, top, right, bottom = bbox_xyxy
+    draw.rectangle((left, top, right, bottom), outline="red", width=6)
+    pad = 8
+    tag = draw.textbbox((left, top), label, font=font)
+    # Grow the measured text box by the padding to get the tag background.
+    draw.rectangle(
+        (tag[0] - pad, tag[1] - pad, tag[2] + pad, tag[3] + pad),
+        fill="red",
+    )
+    draw.text((left, top), label, fill="white", font=font)
+
+
+def _draw_mask_label(
+    *,
+    draw: ImageDraw.ImageDraw,
+    segment: dict[str, Any],
+    mask: Image.Image,
+    label: str,
+    font: ImageFont.ImageFont,
+) -> None:
+    """Centre a red, white-bordered ``label`` tag on the mask's pixel centroid.
+
+    Falls back to the centre of ``segment["bbox_xyxy"]`` when the mask has no
+    visible pixels.
+    """
+    centroid = _mask_visible_pixel_centroid(mask)
+    if centroid is not None:
+        x, y = centroid
+    else:
+        x1, y1, x2, y2 = segment["bbox_xyxy"]
+        x, y = float(x1 + x2) * 0.5, float(y1 + y2) * 0.5
+
+    pad = 8
+    measured = draw.textbbox((0, 0), label, font=font)
+    text_w = measured[2] - measured[0]
+    text_h = measured[3] - measured[1]
+    text_x = x - text_w * 0.5
+    text_y = y - text_h * 0.5
+    tag = (
+        text_x - pad,
+        text_y - pad,
+        text_x + text_w + pad,
+        text_y + text_h + pad,
+    )
+    draw.rectangle(tag, fill="red", outline="white", width=3)
+    draw.text((text_x, text_y), label, fill="white", font=font)
+
+
+def _mask_visible_pixel_centroid(mask: Image.Image) -> tuple[float, float] | None:
+    """Return the (x, y) mean of non-zero mask pixels, or None if there are none."""
+    pixels = np.asarray(mask.convert("L"), dtype=np.uint8)
+    rows, cols = np.nonzero(pixels > 0)
+    if cols.size == 0:
+        return None
+    return float(cols.mean()), float(rows.mean())
+
+
+def _mask_outline(mask: Image.Image) -> Image.Image:
+    """Edge-detect the ``L`` mask, then thicken the edges with a size-5 max filter."""
+    # Only the thickened border is returned, so tinting never covers the object.
+    edges = mask.convert("L").filter(ImageFilter.FIND_EDGES)
+    return edges.filter(ImageFilter.MaxFilter(5))
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/__init__.py
new file mode 100644
index 000000000..32f8ef6cd
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/__init__.py
@@ -0,0 +1,31 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.blender_rendering_manager.manager import (
+ BlenderRenderingManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.blender_rendering_manager.schemas import (
+ RenderObjectScenesRequest,
+ RenderObjectScenesResult,
+)
+
+__all__ = [
+ "BlenderRenderingManager",
+ "RenderObjectScenesRequest",
+ "RenderObjectScenesResult",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/manager.py
new file mode 100644
index 000000000..6ae3d0c6e
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/manager.py
@@ -0,0 +1,175 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+import subprocess
+import tempfile
+from pathlib import Path
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.blender_rendering_manager.schemas import (
+ RenderObjectScenesRequest,
+ RenderObjectScenesResult,
+)
+
+__all__ = ["BlenderRenderingManager"]
+
+
+class BlenderRenderingManager:
+ """Render simulation scenes to a still image through Blender's background CLI."""
+
+ def render_object_scenes(
+ self,
+ request: RenderObjectScenesRequest,
+ ) -> RenderObjectScenesResult:
+ """Render one front-oblique image of all Z-up scenes to ``request.output_path``."""
+ output_path = request.output_path.expanduser().resolve()
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ with tempfile.TemporaryDirectory(prefix="p2s_blender_render_") as tmp_dir:  # GLBs are temporary
+ glb_paths = self._export_y_up_scenes(
+ request.object_scenes,
+ Path(tmp_dir),
+ )
+ self._render_glbs(
+ glb_paths,
+ output_path,
+ timeout_seconds=request.timeout_seconds,
+ )
+ return RenderObjectScenesResult(output_path=output_path)
+
+ @staticmethod
+ def _export_y_up_scenes(
+ object_scenes: list[tuple[str, object]],
+ output_dir: Path,
+ ) -> list[Path]:
+ z_up_to_y_up = np.array(  # rows give x' = x, y' = z, z' = -y (column-vector convention)
+ [
+ [1.0, 0.0, 0.0, 0.0],
+ [0.0, 0.0, 1.0, 0.0],
+ [0.0, -1.0, 0.0, 0.0],
+ [0.0, 0.0, 0.0, 1.0],
+ ],
+ dtype=np.float64,
+ )
+ paths: list[Path] = []
+ for object_id, scene in object_scenes:
+ path = output_dir / f"{object_id}_render.glb"  # NOTE(review): equal ids overwrite each other
+ copied = scene.copy()  # transform a copy so the caller's scene is left as-is
+ copied.apply_transform(z_up_to_y_up)
+ copied.export(path)
+ paths.append(path)
+ return paths
+
+ @classmethod
+ def _render_glbs(
+ cls,
+ glb_paths: list[Path],
+ output_path: Path,
+ *,
+ timeout_seconds: int,
+ ) -> None:
+ script = cls._front_oblique_script(glb_paths, output_path)
+ with tempfile.NamedTemporaryFile(
+ mode="w",
+ suffix=".py",
+ encoding="utf-8",
+ delete=False,  # file must outlive this handle so Blender can read it; unlinked in finally
+ ) as file:
+ script_path = Path(file.name)
+ file.write(script)
+ try:
+ subprocess.run(
+ ["blender", "--background", "--python", str(script_path)],
+ check=True,
+ timeout=timeout_seconds,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ )
+ except subprocess.CalledProcessError as exc:  # NOTE(review): TimeoutExpired / missing binary propagate as-is
+ stderr_tail = (exc.stderr or "").strip()[-4000:]  # keep only the last 4000 characters
+ raise RuntimeError(
+ f"Blender front-oblique render failed:\n{stderr_tail}"
+ ) from exc
+ finally:
+ script_path.unlink(missing_ok=True)
+ if not output_path.is_file():  # a zero exit status is not trusted; the image must exist
+ raise FileNotFoundError(f"Blender render was not written: {output_path}")
+
+ @staticmethod
+ def _front_oblique_script(glb_paths: list[Path], output_path: Path) -> str:
+ object_paths_json = json.dumps([str(path.resolve()) for path in glb_paths])  # embedded via !r below
+ output_path_json = json.dumps(str(output_path.resolve()))  # re-parsed by json.loads in the script
+ return f"""\
+import bpy
+import json
+import mathutils
+
+object_paths = json.loads({object_paths_json!r})
+output_path = json.loads({output_path_json!r})
+bpy.ops.object.select_all(action="SELECT")
+bpy.ops.object.delete()
+for path in object_paths:
+ bpy.ops.import_scene.gltf(filepath=path)
+mesh_objects = [obj for obj in bpy.context.scene.objects if obj.type == "MESH"]
+if not mesh_objects:
+ raise RuntimeError("No mesh objects were imported.")
+min_corner = mathutils.Vector((float("inf"), float("inf"), float("inf")))
+max_corner = mathutils.Vector((float("-inf"), float("-inf"), float("-inf")))
+for obj in mesh_objects:
+ for corner in obj.bound_box:
+ world = obj.matrix_world @ mathutils.Vector(corner)
+ min_corner.x = min(min_corner.x, world.x)
+ min_corner.y = min(min_corner.y, world.y)
+ min_corner.z = min(min_corner.z, world.z)
+ max_corner.x = max(max_corner.x, world.x)
+ max_corner.y = max(max_corner.y, world.y)
+ max_corner.z = max(max_corner.z, world.z)
+center = (min_corner + max_corner) * 0.5
+span_x = max(max_corner.x - min_corner.x, 1.0e-4)
+span_y = max(max_corner.y - min_corner.y, 1.0e-4)
+span_z = max(max_corner.z - min_corner.z, 1.0e-4)
+camera_data = bpy.data.cameras.new("front_oblique_camera")
+camera = bpy.data.objects.new("front_oblique_camera", camera_data)
+bpy.context.collection.objects.link(camera)
+view_distance = max(span_x, span_y, span_z) * 2.4
+camera.location = (center.x, center.y - view_distance, center.z + view_distance * 0.75)
+camera.rotation_euler = (center - camera.location).to_track_quat("-Z", "Y").to_euler()
+camera_data.type = "ORTHO"
+camera_data.ortho_scale = max(span_x, span_y, span_z * 1.8) * 1.35
+bpy.context.scene.camera = camera
+light_data = bpy.data.lights.new("front_oblique_area_light", "AREA")
+light = bpy.data.objects.new("front_oblique_area_light", light_data)
+bpy.context.collection.objects.link(light)
+light.location = camera.location
+light_data.energy = 350.0
+light_data.size = max(span_x, span_y) * 2.0
+bpy.context.scene.world.color = (0.90, 0.90, 0.90)
+try:
+ bpy.context.scene.render.engine = "BLENDER_EEVEE_NEXT"
+except Exception:
+ bpy.context.scene.render.engine = "BLENDER_EEVEE"
+bpy.context.scene.render.resolution_x = 768
+bpy.context.scene.render.resolution_y = 768
+bpy.context.scene.render.film_transparent = False
+bpy.context.scene.view_settings.view_transform = "Standard"
+bpy.context.scene.view_settings.look = "Medium Contrast"
+bpy.context.scene.render.filepath = output_path
+bpy.ops.render.render(write_still=True)
+"""
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/schemas.py
new file mode 100644
index 000000000..e3f986c7f
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/blender_rendering_manager/schemas.py
@@ -0,0 +1,39 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+__all__ = ["RenderObjectScenesRequest", "RenderObjectScenesResult"]
+
+
+@dataclass(frozen=True)
+class RenderObjectScenesRequest:
+ """Request to render internal Z-up object scenes with Blender."""
+
+ object_scenes: list[tuple[str, Any]]  # (object_id, scene) pairs
+ output_path: Path  # where the rendered image is written
+ timeout_seconds: int = 180  # time limit for the render, in seconds
+
+
+@dataclass(frozen=True)
+class RenderObjectScenesResult:
+ """Result of rendering object scenes: the path of the rendered image."""
+
+ output_path: Path
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/__init__.py
new file mode 100644
index 000000000..ef8b93154
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/__init__.py
@@ -0,0 +1,45 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_generation_manager.manager import (
+ GeometryGenerationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_generation_manager.schemas import (
+ GeometryGenerationRequest,
+ GeometryGenerationResult,
+ MultiObjectGenerationObject,
+ MultiObjectGenerationRequest,
+ MultiObjectGenerationResult,
+ RgbaImageToGeometryRequest,
+ RgbaImagesToGeometriesObject,
+ RgbaImagesToGeometriesRequest,
+ RgbaImagesToGeometriesResult,
+)
+
+__all__ = [
+ "GeometryGenerationManager",
+ "GeometryGenerationRequest",
+ "GeometryGenerationResult",
+ "MultiObjectGenerationObject",
+ "MultiObjectGenerationRequest",
+ "MultiObjectGenerationResult",
+ "RgbaImageToGeometryRequest",
+ "RgbaImagesToGeometriesObject",
+ "RgbaImagesToGeometriesRequest",
+ "RgbaImagesToGeometriesResult",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/manager.py
new file mode 100644
index 000000000..d30ea09aa
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/manager.py
@@ -0,0 +1,209 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from PIL import Image
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.geometry_generation_client import (
+ GeometryGenerationClient,
+ GeometryGenerationError,
+ GeometryGenerationServerRequest,
+ MultiObjectGenerationError,
+ MultiObjectGenerationServerRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_generation_manager.schemas import (
+ GeometryGenerationRequest,
+ GeometryGenerationResult,
+ MultiObjectGenerationObject,
+ MultiObjectGenerationRequest,
+ MultiObjectGenerationResult,
+ RgbaImageToGeometryRequest,
+ RgbaImagesToGeometriesObject,
+ RgbaImagesToGeometriesRequest,
+ RgbaImagesToGeometriesResult,
+)
+
+
+class GeometryGenerationManager:
+    """Validate geometry-generation requests and run them through the client."""
+
+    def __init__(self, *, client: GeometryGenerationClient | None = None) -> None:
+        self.client = client if client else GeometryGenerationClient()
+
+    def generate_single_object_mesh(
+        self,
+        request: GeometryGenerationRequest,
+    ) -> GeometryGenerationResult:
+        """Generate one GLB mesh from ``request.image_path`` via the client.
+
+        Both paths are resolved and validated first. Raises ``RuntimeError``
+        with the client's message when it returns a ``GeometryGenerationError``.
+        """
+        source = request.image_path.expanduser().resolve()
+        target = request.output_path.expanduser().resolve()
+        _validate_single_object_request(image_path=source, output_path=target)
+        payload = GeometryGenerationServerRequest(
+            image_path=source, output_path=target
+        )
+        response = self.client.generate(payload)
+        if isinstance(response, GeometryGenerationError):
+            raise RuntimeError(response.error_message)
+        mesh_path = Path(response.result.geometry_path).expanduser().resolve()
+        return GeometryGenerationResult(output_path=mesh_path)
+
+    def generate_multi_object_meshes(
+        self,
+        request: MultiObjectGenerationRequest,
+    ) -> MultiObjectGenerationResult:
+        """Generate one mesh per mask of a scene image via the client.
+
+        Inputs are validated and mask paths resolved before the call. Each
+        returned object keeps its name, rotation, translation and scale. Raises
+        ``RuntimeError`` when the client returns a ``MultiObjectGenerationError``.
+        """
+        scene_image = request.image_path.expanduser().resolve()
+        out_dir = request.output_dir.expanduser().resolve()
+        _validate_multi_object_request(
+            image_path=scene_image, mask_paths=request.mask_paths, output_dir=out_dir
+        )
+        masks = [mask.expanduser().resolve() for mask in request.mask_paths]
+        payload = MultiObjectGenerationServerRequest(
+            image_path=scene_image, mask_paths=masks
+        )
+        response = self.client.generate_multiple_objects(payload, output_dir=out_dir)
+        if isinstance(response, MultiObjectGenerationError):
+            raise RuntimeError(response.error_message)
+        objects = [
+            MultiObjectGenerationObject(
+                name=item.name,
+                geometry_path=Path(item.geometry_path).expanduser().resolve(),
+                rotation_quaternion_wxyz=item.rotation_quaternion_wxyz,
+                translation=item.translation,
+                scale=item.scale,
+            )
+            for item in response.result.objects
+        ]
+        return MultiObjectGenerationResult(objects=objects)
+
+    def convert_rgba_image_to_geometry(
+        self,
+        request: RgbaImageToGeometryRequest,
+    ) -> Path:
+        """Generate a mesh from one alpha-channel image and return the mesh path."""
+        source = request.image_path.expanduser().resolve()
+        target = request.output_path.expanduser().resolve()
+        _validate_rgba_image(source)
+        generated = self.generate_single_object_mesh(
+            GeometryGenerationRequest(image_path=source, output_path=target)
+        )
+        return _postprocess_mesh(generated.output_path)
+
+    def convert_rgba_images_to_geometries(
+        self,
+        request: RgbaImagesToGeometriesRequest,
+    ) -> RgbaImagesToGeometriesResult:
+        """Generate meshes for every mask of a scene image.
+
+        Delegates to ``generate_multi_object_meshes`` and re-packs its objects.
+        """
+        scene_image = request.image_path.expanduser().resolve()
+        out_dir = request.output_dir.expanduser().resolve()
+        _validate_rgba_images_request(scene_image, request.mask_paths)
+        multi_request = MultiObjectGenerationRequest(
+            image_path=scene_image, mask_paths=request.mask_paths, output_dir=out_dir
+        )
+        generated = self.generate_multi_object_meshes(multi_request)
+        objects = [
+            RgbaImagesToGeometriesObject(
+                name=item.name,
+                geometry_path=_postprocess_mesh(item.geometry_path),
+                rotation_quaternion_wxyz=item.rotation_quaternion_wxyz,
+                translation=item.translation,
+                scale=item.scale,
+            )
+            for item in generated.objects
+        ]
+        return RgbaImagesToGeometriesResult(objects=objects)
+
+
+def _validate_single_object_request(*, image_path: Path, output_path: Path) -> None:
+    if not image_path.is_file():  # the input image must already exist
+        raise FileNotFoundError(f"Geometry generation input not found: {image_path}")
+    if output_path.suffix.lower() != ".glb":  # suffix check ignores case
+        raise ValueError("Geometry generation output_path must be a GLB file path.")
+    if output_path.is_dir():  # is_dir() is already False for a missing path
+        raise ValueError(f"Geometry generation output_path is a directory: {output_path}")
+
+
+def _validate_multi_object_request(
+    *,
+    image_path: Path,
+    mask_paths: list[Path],
+    output_dir: Path,
+) -> None:
+    """Validate the image, masks and output directory of a multi-object request.
+
+    Raises FileNotFoundError for missing files, ValueError for other problems.
+    """
+    if not image_path.is_file():
+        message = f"Multi-object geometry generation input not found: {image_path}"
+        raise FileNotFoundError(message)
+    if not mask_paths:
+        raise ValueError("mask_paths must be non-empty.")
+    for raw_mask in mask_paths:
+        resolved = raw_mask.expanduser().resolve()
+        if not resolved.is_file():
+            raise FileNotFoundError(f"Multi-object geometry mask not found: {resolved}")
+    if output_dir.exists() and not output_dir.is_dir():
+        message = f"Multi-object geometry output_dir is not a directory: {output_dir}"
+        raise ValueError(message)
+
+
+def _validate_rgba_image(image_path: Path) -> None:
+    """Raise unless ``image_path`` is an RGBA/LA image or a P image with transparency."""
+    if not image_path.is_file():
+        raise FileNotFoundError(f"RGBA image not found: {image_path}")
+
+    with Image.open(image_path) as image:
+        mode, info = image.mode, image.info
+    has_alpha = mode in {"RGBA", "LA"} or (mode == "P" and "transparency" in info)
+    if not has_alpha:
+        raise ValueError(
+            "Geometry tool requires an image with an alpha channel, "
+            f"got mode={mode!r}: {image_path}"
+        )
+
+
+def _validate_rgba_images_request(image_path: Path, mask_paths: list[Path]) -> None:
+    """Raise unless the scene image opens with PIL and every mask file exists.
+
+    Only the resolved mask location is tested; the error shows the path as given.
+    """
+    if not image_path.is_file():
+        raise FileNotFoundError(f"Scene image not found: {image_path}")
+    Image.open(image_path).close()  # open only to prove PIL can identify the file
+    if not mask_paths:
+        raise ValueError("mask_paths must be non-empty.")
+    for candidate in mask_paths:
+        if not candidate.expanduser().resolve().is_file():
+            raise FileNotFoundError(f"Mask not found: {candidate}")
+
+
+def _postprocess_mesh(mesh_path: Path) -> Path:
+ return mesh_path.expanduser().resolve()  # currently only normalises the path; the mesh file is untouched
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/schemas.py
new file mode 100644
index 000000000..81f6816a8
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_generation_manager/schemas.py
@@ -0,0 +1,105 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class RgbaImageToGeometryRequest:
+ """Request for converting one RGBA asset image to one mesh."""
+
+ image_path: Path  # input image; the manager requires it to carry transparency
+ output_path: Path  # destination mesh path; the manager requires a .glb suffix
+
+
+@dataclass(frozen=True)
+class RgbaImagesToGeometriesRequest:
+ """Request for converting a scene image with object masks to meshes."""
+
+ image_path: Path  # scene image shared by all masks
+ mask_paths: list[Path]  # must be non-empty; every mask file must exist
+ output_dir: Path
+
+
+@dataclass(frozen=True)
+class RgbaImagesToGeometriesObject:
+ """One generated object mesh and its scene placement (rotation, translation, scale)."""
+
+ name: str
+ geometry_path: Path
+ rotation_quaternion_wxyz: list[float]  # quaternion in w, x, y, z order, per the field name
+ translation: list[float]
+ scale: list[float]
+
+
+@dataclass(frozen=True)
+class RgbaImagesToGeometriesResult:
+ """Result of multi-object geometry generation: one entry per generated object."""
+
+ objects: list[RgbaImagesToGeometriesObject]
+
+ @property
+ def geometry_paths(self) -> list[Path]:
+ return [item.geometry_path for item in self.objects]  # same order as self.objects
+
+
+@dataclass(frozen=True)
+class GeometryGenerationRequest:
+ """Request for generating one object mesh from one image."""
+
+ image_path: Path  # must point at an existing file
+ output_path: Path  # destination mesh path; the manager requires a .glb suffix
+
+
+@dataclass(frozen=True)
+class GeometryGenerationResult:
+ """Result of single-object generation: the generated mesh path."""
+
+ output_path: Path
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationRequest:
+ """Request to generate multiple object meshes from one image and masks."""
+
+ image_path: Path  # scene image shared by all masks
+ mask_paths: list[Path]  # must be non-empty; every mask file must exist
+ output_dir: Path  # if it already exists it must be a directory
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationObject:
+ """One generated object mesh and its scene placement (rotation, translation, scale)."""
+
+ name: str
+ geometry_path: Path
+ rotation_quaternion_wxyz: list[float]  # quaternion in w, x, y, z order, per the field name
+ translation: list[float]
+ scale: list[float]
+
+
+@dataclass(frozen=True)
+class MultiObjectGenerationResult:
+ """Result of multi-object geometry generation: one entry per generated object."""
+
+ objects: list[MultiObjectGenerationObject]
+
+ @property
+ def geometry_paths(self) -> list[Path]:
+ return [item.geometry_path for item in self.objects]  # same order as self.objects
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/__init__.py
new file mode 100644
index 000000000..7d70c81c9
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/__init__.py
@@ -0,0 +1,69 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.manager import (
+ DEFAULT_INPUT_UP_AXIS,
+ DEFAULT_UP_AXIS,
+ GeometryManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.schemas import (
+ AlignToAxisRequest,
+ AlignToAxisResult,
+ AlignXYLongAxisRequest,
+ AlignXYLongAxisResult,
+ CenterMeshRequest,
+ CenterMeshResult,
+ ConvertUpAxisRequest,
+ ConvertUpAxisResult,
+ DetectTabletopRequest,
+ DetectTabletopResult,
+ ExportMeshRequest,
+ ExportMeshResult,
+ LoadMeshRequest,
+ LoadMeshResult,
+ NormalizeRequest,
+ NormalizeResult,
+ PlaceAbovePlaneRequest,
+ PlaceAbovePlaneResult,
+ SupportPlaneCandidate,
+)
+
+# Public API of the package: the manager, its two default up-axis constants,
+# and every request/result schema re-exported above (kept alphabetical).
+__all__ = [
+    "AlignToAxisRequest",
+    "AlignToAxisResult",
+    "AlignXYLongAxisRequest",
+    "AlignXYLongAxisResult",
+    "CenterMeshRequest",
+    "CenterMeshResult",
+    "ConvertUpAxisRequest",
+    "ConvertUpAxisResult",
+    "DEFAULT_INPUT_UP_AXIS",
+    "DEFAULT_UP_AXIS",
+    "DetectTabletopRequest",
+    "DetectTabletopResult",
+    "ExportMeshRequest",
+    "ExportMeshResult",
+    "GeometryManager",
+    "LoadMeshRequest",
+    "LoadMeshResult",
+    "NormalizeRequest",
+    "NormalizeResult",
+    "PlaceAbovePlaneRequest",
+    "PlaceAbovePlaneResult",
+    "SupportPlaneCandidate",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/manager.py
new file mode 100644
index 000000000..ec7a18c6f
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/manager.py
@@ -0,0 +1,794 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Geometry manager for mesh I/O, transforms, and tabletop detection."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import trimesh
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.schemas import (
+ AlignToAxisRequest,
+ AlignToAxisResult,
+ AlignXYLongAxisRequest,
+ AlignXYLongAxisResult,
+ CenterMeshRequest,
+ CenterMeshResult,
+ ConvertUpAxisRequest,
+ ConvertUpAxisResult,
+ DetectTabletopRequest,
+ DetectTabletopResult,
+ ExportMeshRequest,
+ ExportMeshResult,
+ LoadMeshRequest,
+ LoadMeshResult,
+ NormalizeRequest,
+ NormalizeResult,
+ PlaceAbovePlaneRequest,
+ PlaceAbovePlaneResult,
+ SupportPlaneCandidate,
+)
+
+# The package ``__init__`` re-exports both default axes from this module, so
+# they are part of the public surface alongside the manager class.
+__all__ = ["DEFAULT_INPUT_UP_AXIS", "DEFAULT_UP_AXIS", "GeometryManager"]
+
+# Up axis assumed for an incoming mesh when a request gives none (+Y).
+DEFAULT_INPUT_UP_AXIS = [0.0, 1.0, 0.0]
+# Up axis a mesh is converted to when a request gives none (+Z).
+DEFAULT_UP_AXIS = [0.0, 0.0, 1.0]
+
+
+class GeometryManager:
+ """Manager for mesh geometry operations.
+
+ Provides typed methods for mesh I/O, axis conversion, bounding-box
+ transforms, tabletop plane detection, and PCA alignment, following
+ the same pattern as service clients.
+ """
+
+    @staticmethod
+    def compose_json_matrices(*values: Any) -> list[list[float]]:
+        """Multiply 4x4 matrices left to right via ``utils._compose_json_matrices``.
+
+        Returns the product as nested lists, or an empty list when any input
+        is not of shape (4, 4).
+        """
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._compose_json_matrices(*values)
+
+    @staticmethod
+    def compose_simready_to_aligned_matrix(
+        *, raw_to_aligned_matrix: Any, raw_to_simready_matrix: Any
+    ) -> list[list[float]]:
+        """Forward both matrices to ``utils._compose_simready_to_aligned_matrix``.
+
+        The result of the helper is returned unchanged.
+        """
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._compose_simready_to_aligned_matrix(
+            raw_to_aligned_matrix=raw_to_aligned_matrix,
+            raw_to_simready_matrix=raw_to_simready_matrix,
+        )
+
+    @staticmethod
+    def decompose_transform_matrix(matrix_value: Any) -> dict[str, Any]:
+        """Forward to ``utils._decompose_transform_matrix`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._decompose_transform_matrix(matrix_value)
+
+    @staticmethod
+    def support_normal_flip_transform(**kwargs: Any) -> Any:
+        """Forward all keyword arguments to ``utils._support_normal_flip_transform``."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._support_normal_flip_transform(**kwargs)
+
+    @staticmethod
+    def z_yaw_transform(yaw_degrees: float) -> Any:
+        """Forward to ``utils._z_yaw_transform`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._z_yaw_transform(yaw_degrees)
+
+    @staticmethod
+    def z_up_to_glb_y_up_transform() -> Any:
+        """Forward to ``utils._z_up_to_glb_y_up_transform`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._z_up_to_glb_y_up_transform()
+
+    @staticmethod
+    def copy_scene_with_transform(scene: Any, transform: Any) -> Any:
+        """Forward to ``utils._copy_scene_with_transform`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._copy_scene_with_transform(scene, transform)
+
+    @staticmethod
+    def matrix_from_json(value: Any, *, name: str) -> Any:
+        """Forward to ``utils._matrix_from_json`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._matrix_from_json(value, name=name)
+
+    @staticmethod
+    def load_scene_with_transform(**kwargs: Any) -> Any:
+        """Forward all keyword arguments to ``utils._load_scene_with_transform``."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._load_scene_with_transform(**kwargs)
+
+    @staticmethod
+    def estimate_support_normal(mesh: Any) -> Any:
+        """Forward to ``utils._estimate_support_normal`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._estimate_support_normal(mesh)
+
+    @staticmethod
+    def rotation_between_vectors(source: Any, target: Any) -> Any:
+        """Forward to ``utils._rotation_between_vectors`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._rotation_between_vectors(source, target)
+
+    @staticmethod
+    def transform_point(transform: Any, point: Any) -> Any:
+        """Forward to ``utils._transform_point`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._transform_point(transform, point)
+
+    @staticmethod
+    def aabb_center(bounds: Any) -> Any:
+        """Forward to ``utils._aabb_center`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._aabb_center(bounds)
+
+    @staticmethod
+    def xy_aabb_center(bounds: Any) -> Any:
+        """Forward to ``utils._xy_aabb_center`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._xy_aabb_center(bounds)
+
+    @staticmethod
+    def xy_aabb_size(bounds: Any) -> Any:
+        """Forward to ``utils._xy_aabb_size`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._xy_aabb_size(bounds)
+
+    @staticmethod
+    def aabb_bottom_to_xy_plane_transform(bounds: Any) -> Any:
+        """Forward to ``utils._aabb_bottom_to_xy_plane_transform`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._aabb_bottom_to_xy_plane_transform(bounds)
+
+    @staticmethod
+    def scale_transform(scale: float) -> Any:
+        """Forward to ``utils._scale_transform`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._scale_transform(scale)
+
+    @staticmethod
+    def compose_sam3d_multi_object_transform(**kwargs: Any) -> Any:
+        """Forward all keyword arguments to ``utils._compose_sam3d_multi_object_transform``."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._compose_sam3d_multi_object_transform(**kwargs)
+
+    @staticmethod
+    def detect_table_fit_support_quad(
+        mesh: Any,
+        *,
+        target_aspect: float,
+    ) -> dict[str, Any]:
+        """Forward to ``utils._detect_table_fit_support_quad`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._detect_table_fit_support_quad(
+            mesh,
+            target_aspect=target_aspect,
+        )
+
+    @staticmethod
+    def load_table_fit_scene_internal_z(path: Path, *, trimesh: Any, y_to_z: Any) -> Any:
+        """Forward to ``utils._load_table_fit_scene_internal_z`` and return its result.
+
+        The ``trimesh`` keyword shadows the module-level ``trimesh`` import
+        inside this method; it is passed straight through to the helper.
+        """
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._load_table_fit_scene_internal_z(
+            path,
+            trimesh=trimesh,
+            y_to_z=y_to_z,
+        )
+
+    @staticmethod
+    def table_fit_scene_union_bounds(scenes: list[Any], *, trimesh: Any) -> Any:
+        """Forward to ``utils._table_fit_scene_union_bounds`` and return its result.
+
+        The ``trimesh`` keyword shadows the module-level ``trimesh`` import
+        inside this method; it is passed straight through to the helper.
+        """
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._table_fit_scene_union_bounds(scenes, trimesh=trimesh)
+
+    @staticmethod
+    def table_fit_bounds_xy_manifest(
+        bounds: Any,
+        *,
+        unit_scale: float,
+    ) -> dict[str, Any]:
+        """Forward to ``utils._table_fit_bounds_xy_manifest`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._table_fit_bounds_xy_manifest(
+            bounds,
+            unit_scale=unit_scale,
+        )
+
+    @staticmethod
+    def table_fit_uniform_xy_scale_transform(**kwargs: Any) -> Any:
+        """Forward all keyword arguments to ``utils._table_fit_uniform_xy_scale_transform``."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._table_fit_uniform_xy_scale_transform(**kwargs)
+
+    @staticmethod
+    def table_fit_uniform_scale_transform(**kwargs: Any) -> Any:
+        """Forward all keyword arguments to ``utils._table_fit_uniform_scale_transform``."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._table_fit_uniform_scale_transform(**kwargs)
+
+    @staticmethod
+    def table_fit_safe_positive_ratio(numerator: float, denominator: float) -> float:
+        """Forward to ``utils._table_fit_safe_positive_ratio`` and return its result."""
+        # Deferred import: ``utils`` imports this package at module level, so a
+        # top-level import here would be circular.
+        from . import utils as geometry_utils
+
+        return geometry_utils._table_fit_safe_positive_ratio(numerator, denominator)
+
+    @staticmethod
+    def load_mesh(request: LoadMeshRequest) -> LoadMeshResult:
+        """Read a mesh file and return it as a single geometry object.
+
+        Args:
+            request: Carries ``mesh_path``; ``~`` is expanded and the path is
+                resolved before loading.
+
+        Returns:
+            ``LoadMeshResult`` wrapping the loaded geometry. When the file
+            loads as a ``trimesh.Scene`` its mesh-like members are concatenated
+            into one mesh; any other loaded object is returned as-is.
+
+        Raises:
+            FileNotFoundError: If the resolved path is not an existing file.
+            ValueError: If a loaded scene has no member exposing both
+                ``vertices`` and ``faces``.
+        """
+        resolved_path = request.mesh_path.expanduser().resolve()
+        if not resolved_path.is_file():
+            raise FileNotFoundError(f"Mesh file not found: {resolved_path}")
+
+        geometry = trimesh.load(resolved_path, force=None)
+        if not isinstance(geometry, trimesh.Scene):
+            return LoadMeshResult(mesh=geometry)
+
+        # Keep only scene members that look like meshes before merging them.
+        mesh_parts = []
+        for part in geometry.dump(concatenate=False):
+            if hasattr(part, "vertices") and hasattr(part, "faces"):
+                mesh_parts.append(part)
+        if not mesh_parts:
+            raise ValueError(f"Scene contains no mesh geometry: {resolved_path}")
+        return LoadMeshResult(mesh=trimesh.util.concatenate(mesh_parts))
+
+    @staticmethod
+    def export_mesh(request: ExportMeshRequest) -> ExportMeshResult:
+        """Write ``request.mesh`` to disk and report where it landed.
+
+        The output path has ``~`` expanded and is resolved; missing parent
+        directories are created before the mesh's ``export`` method is called.
+
+        Raises:
+            FileNotFoundError: If no file exists at the target after export.
+        """
+        target = request.output_path.expanduser().resolve()
+        target.parent.mkdir(parents=True, exist_ok=True)
+        request.mesh.export(target)
+        if target.is_file():
+            return ExportMeshResult(output_path=target)
+        raise FileNotFoundError(f"Mesh was not written: {target}")
+
+
+    @staticmethod
+    def convert_up_axis(request: ConvertUpAxisRequest) -> ConvertUpAxisResult:
+        """Rotate a mesh from one up-axis convention to another.
+
+        Args:
+            request: Carries the mesh plus optional ``input_up_axis`` and
+                ``output_up_axis``. A ``None`` or empty axis falls back to
+                ``DEFAULT_INPUT_UP_AXIS`` / ``DEFAULT_UP_AXIS`` respectively.
+
+        Returns:
+            ``ConvertUpAxisResult`` holding a rotated copy of the mesh.
+
+        Raises:
+            ValueError: If either axis has zero norm.
+        """
+        # Test for None/empty explicitly: ``axis or default`` raises on a
+        # multi-element NumPy array because its truth value is ambiguous.
+        source_axis = request.input_up_axis
+        if source_axis is None or len(source_axis) == 0:
+            source_axis = DEFAULT_INPUT_UP_AXIS
+        target_axis = request.output_up_axis
+        if target_axis is None or len(target_axis) == 0:
+            target_axis = DEFAULT_UP_AXIS
+
+        converted = GeometryManager._align_vector_to_axis(
+            request.mesh,
+            source_axis=source_axis,
+            target_axis=target_axis,
+        )
+        return ConvertUpAxisResult(mesh=converted)
+
+    @staticmethod
+    def center_by_bbox(request: CenterMeshRequest) -> CenterMeshResult:
+        """Translate a copy of the mesh so its AABB center sits at the origin.
+
+        Returns:
+            ``CenterMeshResult`` with the shifted copy and the AABB center that
+            was subtracted.
+
+        Raises:
+            ValueError: If the mesh fails validation or its bounds are not of
+                shape (2, 3).
+        """
+        mesh = request.mesh
+        GeometryManager._validate_mesh(mesh)
+
+        aabb = np.asarray(mesh.bounds, dtype=float)
+        if aabb.shape != (2, 3):
+            raise ValueError("Mesh bounds must have shape (2, 3).")
+
+        lower, upper = aabb
+        midpoint = (lower + upper) * 0.5
+        shifted = mesh.copy()
+        shifted.apply_translation(-midpoint)
+        center_as_list = [float(component) for component in midpoint]
+        return CenterMeshResult(mesh=shifted, bbox_center=center_as_list)
+
+    @staticmethod
+    def align_to_axis(request: AlignToAxisRequest) -> AlignToAxisResult:
+        """Return a rotated copy of the mesh with ``source_axis`` on ``target_axis``.
+
+        Raises:
+            ValueError: If either axis has zero norm.
+        """
+        rotated = GeometryManager._align_vector_to_axis(
+            request.mesh,
+            source_axis=request.source_axis,
+            target_axis=request.target_axis,
+        )
+        result = AlignToAxisResult(mesh=rotated)
+        return result
+
+    @staticmethod
+    def place_above_plane(
+        request: PlaceAbovePlaneRequest,
+    ) -> PlaceAbovePlaneResult:
+        """Shift a copy of the mesh along Z so its AABB bottom is at ``clearance``.
+
+        Raises:
+            ValueError: If ``clearance`` is negative or the mesh bounds are
+                not of shape (2, 3).
+        """
+        clearance = request.clearance
+        if clearance < 0.0:
+            raise ValueError("clearance must be non-negative.")
+
+        aabb = np.asarray(request.mesh.bounds, dtype=float)
+        if aabb.shape != (2, 3):
+            raise ValueError("Mesh bounds must have shape (2, 3).")
+
+        # Row 0 of the bounds is the minimum corner; column 2 is its Z value.
+        bottom_z = float(aabb[0][2])
+        z_shift = clearance - bottom_z
+        lifted = request.mesh.copy()
+        lifted.apply_translation([0.0, 0.0, z_shift])
+        return PlaceAbovePlaneResult(mesh=lifted)
+
+    @staticmethod
+    def normalize(request: NormalizeRequest) -> NormalizeResult:
+        """Uniformly scale a mesh so its longest extent equals ``target_size``.
+
+        The extents are those of the mesh's *oriented* bounding box
+        (``bounding_box_oriented``), not of its axis-aligned box.
+
+        Returns:
+            ``NormalizeResult`` with the scaled copy and the factor applied.
+
+        Raises:
+            ValueError: If ``target_size`` is not positive, or the longest
+                oriented-box extent is not a positive finite number.
+        """
+        if request.target_size <= 0.0:
+            raise ValueError("target_size must be positive.")
+
+        extents = np.asarray(
+            request.mesh.bounding_box_oriented.primitive.extents, dtype=float
+        )
+        longest = float(np.max(extents))
+        # A degenerate mesh would otherwise surface as a bare
+        # ZeroDivisionError (or an inf/nan scale) on the division below.
+        if not np.isfinite(longest) or longest <= 0.0:
+            raise ValueError(
+                "Mesh oriented bounding-box extents must be positive and finite."
+            )
+
+        scale_factor = request.target_size / longest
+        normalized = request.mesh.copy()
+        normalized.apply_scale(scale_factor)
+        return NormalizeResult(mesh=normalized, scale_factor=scale_factor)
+
+    @staticmethod
+    def mesh_aabb_size(mesh: Any) -> Any:
+        """Compute the per-axis size of the mesh's axis-aligned bounding box.
+
+        Raises:
+            ValueError: If ``mesh.bounds`` is not of shape (2, 3) or any axis
+                has non-positive size.
+        """
+        aabb = np.asarray(mesh.bounds, dtype=np.float64)
+        if aabb.shape != (2, 3):
+            raise ValueError("Mesh bounds must have shape (2, 3).")
+        lower, upper = aabb[0], aabb[1]
+        size = upper - lower
+        has_flat_axis = np.any(size <= 0.0)
+        if has_flat_axis:
+            raise ValueError(f"Mesh AABB size must be positive, got {size.tolist()}.")
+        return size
+
+    @staticmethod
+    def mesh_pca_bbox_size(mesh: Any) -> Any:
+        """Measure the mesh's extents along its vertex PCA axes.
+
+        Extents are ordered from the highest-variance axis to the lowest.
+        Metric-scale estimation uses this because it reacts less to arbitrary
+        object yaw/tilt than a world-axis AABB does. Falls back to
+        ``mesh_aabb_size`` when the vertices are unusable, the covariance is
+        not a finite 3x3 matrix, or any projected extent is non-positive or
+        non-finite.
+        """
+        points = np.asarray(mesh.vertices, dtype=np.float64)
+        usable = points.ndim == 2 and points.shape[1] == 3 and len(points) >= 3
+        if not usable:
+            return GeometryManager.mesh_aabb_size(mesh)
+
+        offsets = points - np.mean(points, axis=0)
+        covariance = np.cov(offsets, rowvar=False)
+        if covariance.shape != (3, 3) or not np.all(np.isfinite(covariance)):
+            return GeometryManager.mesh_aabb_size(mesh)
+
+        variances, directions = np.linalg.eigh(covariance)
+        # eigh returns ascending eigenvalues; reorder to descending variance.
+        descending = np.argsort(variances)[::-1]
+        frame = directions[:, descending]
+        if np.linalg.det(frame) < 0.0:
+            # Flip the last axis so the frame has positive determinant.
+            frame[:, -1] *= -1.0
+
+        coords = offsets @ frame
+        size = coords.max(axis=0) - coords.min(axis=0)
+        if np.any(size <= 0.0) or not np.all(np.isfinite(size)):
+            return GeometryManager.mesh_aabb_size(mesh)
+        return size
+
+    @staticmethod
+    def mesh_metric_bbox_size(mesh: Any) -> Any:
+        """Return the bbox size that metric-scale estimation works with.
+
+        This is the PCA-frame size from ``mesh_pca_bbox_size``.
+        """
+        pca_size = GeometryManager.mesh_pca_bbox_size(mesh)
+        return pca_size
+
+    @staticmethod
+    def bbox_ratio(size: Any) -> Any:
+        """Divide bbox dimensions by the largest one, so the maximum becomes 1.
+
+        Raises:
+            ValueError: If the largest dimension is not positive.
+        """
+        dims = np.asarray(size, dtype=np.float64)
+        largest = float(np.max(dims))
+        if largest <= 0.0:
+            raise ValueError("bbox size max must be positive.")
+        ratio = dims / largest
+        return ratio
+
+    @staticmethod
+    def best_axis_bbox_scale_match(
+        *,
+        source_size_cm: Any,
+        target_size_cm: Any,
+    ) -> dict[str, Any]:
+        """Find the axis permutation of the target bbox that best fits the source.
+
+        Every ordering of the three target dimensions is compared with the
+        source by shape only (each bbox normalized by its largest dimension),
+        using the mean absolute log-ratio as the error. The lowest-error
+        ordering wins; on a tie the earliest permutation in lexicographic
+        order is kept.
+
+        Args:
+            source_size_cm: Three positive source bbox dimensions.
+            target_size_cm: Three positive target bbox dimensions.
+
+        Returns:
+            A dict describing the winning permutation, including the per-axis
+            scale (target / source) and ``scale_factor``, the median of those.
+
+        Raises:
+            ValueError: If either input is not of shape (3,) or contains a
+                non-positive value.
+        """
+        # Local import keeps this block self-contained.
+        from itertools import permutations
+
+        source = np.asarray(source_size_cm, dtype=np.float64)
+        target = np.asarray(target_size_cm, dtype=np.float64)
+        if source.shape != (3,) or target.shape != (3,):
+            raise ValueError("source_size_cm and target_size_cm must have shape (3,).")
+        if np.any(source <= 0.0) or np.any(target <= 0.0):
+            raise ValueError("source_size_cm and target_size_cm must be positive.")
+
+        source_ratio = GeometryManager.bbox_ratio(source)
+        candidates: list[dict[str, Any]] = []
+        # permutations(range(3)) yields the six orderings lexicographically,
+        # the same order as the previous hand-written table.
+        for permutation in permutations(range(3)):
+            target_perm = target[list(permutation)]
+            target_ratio = GeometryManager.bbox_ratio(target_perm)
+            ratio_error = GeometryManager._mean_abs_log_ratio_error(
+                source_ratio,
+                target_ratio,
+            )
+            per_axis_scale = target_perm / source
+            candidates.append(
+                {
+                    "target_permutation": list(permutation),
+                    "source_size_cm": source.tolist(),
+                    "target_size_cm_original_order": target.tolist(),
+                    "target_size_cm_matched_to_source_axes": target_perm.tolist(),
+                    "source_ratio": source_ratio.tolist(),
+                    "target_ratio_matched": target_ratio.tolist(),
+                    "per_axis_scale": per_axis_scale.tolist(),
+                    "scale_factor": float(np.median(per_axis_scale)),
+                    "shape_ratio_error": float(ratio_error),
+                }
+            )
+        # min() keeps the first minimal element, matching the strict "<"
+        # comparison the explicit loop used.
+        return min(candidates, key=lambda item: item["shape_ratio_error"])
+
+    @staticmethod
+    def scene_to_mesh(scene: Any, *, trimesh: Any | None = None) -> Any:
+        """Collapse a trimesh Scene (or mesh-like object) into one mesh.
+
+        Args:
+            scene: A ``Trimesh`` (returned unchanged) or an object exposing
+                ``dump(concatenate=True)``.
+            trimesh: Optional trimesh module to use instead of the
+                module-level import. The parameter shadows that import, which
+                is why the fallback is read from ``globals()``.
+
+        Raises:
+            ValueError: If the dump yields no ``Trimesh`` instance.
+        """
+        module_default = globals()["trimesh"]
+        mesh_lib = module_default if trimesh is None else trimesh
+
+        if isinstance(scene, mesh_lib.Trimesh):
+            return scene
+
+        flattened = scene.dump(concatenate=True)
+        if isinstance(flattened, mesh_lib.Trimesh):
+            return flattened
+
+        # The dump came back as a collection: keep only real meshes and merge.
+        parts = []
+        for entry in flattened:
+            if isinstance(entry, mesh_lib.Trimesh):
+                parts.append(entry)
+        if not parts:
+            raise ValueError("Scene contains no mesh geometry.")
+        return mesh_lib.util.concatenate(parts)
+
+    @staticmethod
+    def detect_tabletop(
+        request: DetectTabletopRequest,
+    ) -> DetectTabletopResult:
+        """Pick the highest-scoring support plane of a mesh as its tabletop.
+
+        Candidate planes are collected with the tolerances carried by the
+        request, the best-scoring one is selected, and its normal is oriented
+        relative to the rest of the mesh.
+
+        Raises:
+            ValueError: If the mesh is invalid or yields no candidates.
+        """
+        mesh = request.mesh
+        ranked = GeometryManager._find_support_plane_candidates(
+            mesh,
+            normal_angle_tol_deg=request.normal_angle_tol_deg,
+            plane_distance_tol=request.plane_distance_tol,
+            min_area_ratio=request.min_area_ratio,
+            max_candidates=request.max_candidates,
+        )
+        chosen = GeometryManager._select_tabletop_plane(ranked)
+        facing = GeometryManager._orient_plane_normal(
+            mesh,
+            plane_normal=chosen.normal,
+            plane_center=chosen.center,
+        )
+        return DetectTabletopResult(
+            selected=chosen,
+            oriented_normal=facing,
+            candidates=ranked,
+        )
+
+
+    @staticmethod
+    def align_xy_long_axis(
+        request: AlignXYLongAxisRequest,
+    ) -> AlignXYLongAxisResult:
+        """Yaw a copy of the mesh so its XY long axis lies along the Y axis.
+
+        The long axis is the dominant PCA direction of the XY-projected
+        vertices (all vertices, or only those of ``request.face_indices``).
+        The smaller of the two rotations that bring it onto +Y or -Y is used.
+
+        Returns:
+            ``AlignXYLongAxisResult`` with the rotated copy and the applied
+            yaw in degrees.
+
+        Raises:
+            ValueError: If fewer than two vertices are selected or the PCA
+                axis has zero norm.
+        """
+        mesh = request.mesh
+        all_vertices = np.asarray(mesh.vertices, dtype=float)
+        points_xy = GeometryManager._select_xy_vertices(
+            mesh, all_vertices, request.face_indices
+        )
+        if points_xy.shape[0] < 2:
+            raise ValueError(
+                "Mesh must contain at least two vertices for PCA alignment."
+            )
+
+        offsets = points_xy - np.mean(points_xy, axis=0)
+        degrees_of_freedom = max(offsets.shape[0] - 1, 1)
+        scatter = offsets.T @ offsets / degrees_of_freedom
+        spreads, directions = np.linalg.eigh(scatter)
+        principal = directions[:, int(np.argmax(spreads))]
+        if float(np.linalg.norm(principal)) == 0.0:
+            raise ValueError("PCA long axis is degenerate.")
+
+        heading = float(np.arctan2(principal[1], principal[0]))
+        yaw = GeometryManager._minimal_angle_to_align_axis(heading, np.pi / 2.0)
+        yaw_transform = GeometryManager._z_axis_rotation_transform(yaw)
+        rotated = mesh.copy()
+        rotated.apply_transform(yaw_transform)
+        return AlignXYLongAxisResult(
+            mesh=rotated,
+            yaw_angle_degrees=float(np.rad2deg(yaw)),
+        )
+
+
+    @staticmethod
+    def _align_vector_to_axis(
+        mesh: Any,
+        *,
+        source_axis: list[float],
+        target_axis: list[float],
+    ) -> Any:
+        """Return a copy of ``mesh`` rotated so ``source_axis`` maps to ``target_axis``.
+
+        Both axes are normalized first; the input mesh is left untouched.
+
+        Raises:
+            ValueError: If either axis has zero norm.
+        """
+        unit_source = GeometryManager._normalize(np.asarray(source_axis, dtype=float))
+        unit_target = GeometryManager._normalize(np.asarray(target_axis, dtype=float))
+        # ``_normalize`` hands zero vectors back unchanged, so a zero norm
+        # here means the caller supplied a zero axis.
+        if np.linalg.norm(unit_source) == 0:
+            raise ValueError("source_axis must be non-zero.")
+        if np.linalg.norm(unit_target) == 0:
+            raise ValueError("target_axis must be non-zero.")
+
+        rotation = GeometryManager._rotation_transform_between_vectors(
+            unit_source, unit_target
+        )
+        rotated = mesh.copy()
+        rotated.apply_transform(rotation)
+        return rotated
+
+
+    @staticmethod
+    def _find_support_plane_candidates(
+        mesh: Any,
+        *,
+        normal_angle_tol_deg: float = 8.0,
+        plane_distance_tol: float | None = None,
+        min_area_ratio: float = 0.02,
+        max_candidates: int = 24,
+    ) -> list[SupportPlaneCandidate]:
+        """Greedily cluster mesh faces into planar patches and score them.
+
+        Faces are visited from largest to smallest area. Each unused face
+        seeds a patch made of every still-unused face whose normal lies within
+        ``normal_angle_tol_deg`` of the seed normal and whose center lies
+        within ``plane_distance_tol`` of the seed plane. Patches covering less
+        than ``min_area_ratio`` of the total face area are dropped.
+
+        Args:
+            mesh: Object exposing ``face_normals``, ``triangles_center``,
+                ``area_faces``, ``vertices`` and ``extents``.
+            normal_angle_tol_deg: Maximum normal deviation from the seed, in
+                degrees.
+            plane_distance_tol: Maximum offset of a face center from the seed
+                plane; when ``None`` it is 1% of the norm of ``mesh.extents``,
+                with a floor of 1e-4.
+            min_area_ratio: Minimum patch area as a fraction of total area.
+            max_candidates: Maximum number of candidates returned.
+
+        Returns:
+            Candidates sorted by descending score, truncated to
+            ``max_candidates``.
+
+        Raises:
+            ValueError: If the mesh is invalid or has no positive face area.
+        """
+        GeometryManager._validate_mesh(mesh)
+
+        normals = np.asarray(mesh.face_normals, dtype=float)
+        centers = np.asarray(mesh.triangles_center, dtype=float)
+        areas = np.asarray(mesh.area_faces, dtype=float)
+        vertices = np.asarray(mesh.vertices, dtype=float)
+        total_area = float(np.sum(areas))
+        if total_area <= 0:
+            raise ValueError("Mesh has no positive face area.")
+
+        if plane_distance_tol is None:
+            # Default tolerance scales with the mesh size.
+            extent = float(
+                np.linalg.norm(np.asarray(mesh.extents, dtype=float))
+            )
+            plane_distance_tol = max(extent * 0.01, 1e-4)
+
+        cos_tol = float(np.cos(np.deg2rad(normal_angle_tol_deg)))
+        min_area = total_area * min_area_ratio
+        # Largest faces seed first.
+        order = np.argsort(-areas)
+        # Faces already assigned to a patch (or rejected as seeds).
+        used = np.zeros(len(areas), dtype=bool)
+        candidates: list[SupportPlaneCandidate] = []
+
+        for seed_index in order:
+            if used[seed_index]:
+                continue
+            seed_normal = GeometryManager._normalize(normals[seed_index])
+            if np.linalg.norm(seed_normal) == 0:
+                # A zero normal cannot define a plane; retire this face.
+                used[seed_index] = True
+                continue
+
+            seed_center = centers[seed_index]
+            seed_offset = float(np.dot(seed_normal, seed_center))
+            normal_match = normals @ seed_normal >= cos_tol
+            offsets = centers @ seed_normal
+            plane_match = np.abs(offsets - seed_offset) <= plane_distance_tol
+            face_mask = normal_match & plane_match & ~used
+            face_indices = np.flatnonzero(face_mask)
+            if len(face_indices) == 0:
+                used[seed_index] = True
+                continue
+
+            # Faces are consumed even when the patch is later rejected for
+            # being too small, so they cannot join another patch.
+            used[face_indices] = True
+            area = float(np.sum(areas[face_indices]))
+            if area < min_area:
+                continue
+
+            # Area-weighted mean normal and center of the patch.
+            weighted_normal = GeometryManager._normalize(
+                np.sum(
+                    normals[face_indices] * areas[face_indices, None], axis=0
+                ),
+            )
+            center = (
+                np.sum(
+                    centers[face_indices] * areas[face_indices, None], axis=0
+                )
+                / area
+            )
+            candidate = GeometryManager._build_candidate(
+                normal=weighted_normal,
+                center=center,
+                area=area,
+                face_indices=face_indices,
+                vertices=vertices,
+            )
+            candidates.append(candidate)
+
+        candidates.sort(key=lambda c: c.score, reverse=True)
+        return candidates[:max_candidates]
+
+    @staticmethod
+    def _select_tabletop_plane(
+        candidates: list[SupportPlaneCandidate],
+    ) -> SupportPlaneCandidate:
+        """Return the candidate with the highest ``score``.
+
+        Raises:
+            ValueError: If ``candidates`` is empty.
+        """
+        if candidates:
+            return max(candidates, key=lambda plane: plane.score)
+        raise ValueError("No support-plane candidates were found.")
+
+    @staticmethod
+    def _orient_plane_normal(
+        mesh: Any,
+        *,
+        plane_normal: list[float],
+        plane_center: list[float],
+    ) -> list[float]:
+        """Orient a plane normal away from the side holding more of the mesh.
+
+        Signed vertex distances to the plane are summed separately on each
+        side (ignoring those within 1e-6 of the plane). If the side the normal
+        points to carries the larger sum, the normal is flipped.
+
+        Returns:
+            The unit normal, possibly negated, as a list of floats.
+
+        Raises:
+            ValueError: If the mesh is invalid or ``plane_normal`` is zero.
+        """
+        GeometryManager._validate_mesh(mesh)
+
+        unit_normal = GeometryManager._normalize(
+            np.asarray(plane_normal, dtype=float)
+        )
+        origin = np.asarray(plane_center, dtype=float)
+        if np.linalg.norm(unit_normal) == 0:
+            raise ValueError("plane_normal must be non-zero.")
+
+        points = np.asarray(mesh.vertices, dtype=float)
+        heights = (points - origin) @ unit_normal
+        weight_in_front = float(np.sum(np.abs(heights[heights > 1e-6])))
+        weight_behind = float(np.sum(np.abs(heights[heights < -1e-6])))
+
+        if weight_in_front > weight_behind:
+            unit_normal = -unit_normal
+        return [float(component) for component in unit_normal]
+
+    @staticmethod
+    def _build_candidate(
+        *,
+        normal: Any,
+        center: Any,
+        area: float,
+        face_indices: Any,
+        vertices: Any,
+    ) -> SupportPlaneCandidate:
+        """Assemble a scored ``SupportPlaneCandidate`` for one planar patch.
+
+        Vertices more than 1e-6 from the plane are split into "below"
+        (negative signed distance) and "above". For each side the count and
+        the sum of absolute distances are recorded. The score is the patch
+        area times the ratio of the larger distance sum to the smaller one,
+        with that ratio capped at 10.
+        """
+        heights = (vertices - center) @ normal
+        is_below = heights < -1e-6
+        is_above = heights > 1e-6
+        count_below = int(np.count_nonzero(is_below))
+        count_above = int(np.count_nonzero(is_above))
+        weight_below = float(np.sum(np.abs(heights[is_below])))
+        weight_above = float(np.sum(np.abs(heights[is_above])))
+
+        lighter = min(weight_below, weight_above)
+        heavier = max(weight_below, weight_above)
+        # The small epsilon keeps the ratio finite when one side is empty.
+        lopsidedness = min((heavier + 1e-9) / (lighter + 1e-9), 10.0)
+        return SupportPlaneCandidate(
+            normal=[float(component) for component in normal],
+            center=[float(component) for component in center],
+            area=area,
+            face_indices=[int(index) for index in face_indices],
+            below_vertex_count=count_below,
+            above_vertex_count=count_above,
+            below_area_score=weight_below,
+            above_area_score=weight_above,
+            score=float(area * lopsidedness),
+        )
+
+
+    @staticmethod
+    def _select_xy_vertices(
+        mesh: Any,
+        vertices: Any,
+        face_indices: list[int] | None,
+    ) -> Any:
+        """Return the XY columns of the vertices used for yaw alignment.
+
+        With ``face_indices`` set to ``None`` every vertex is used; otherwise
+        only the unique vertices referenced by the listed faces, in ascending
+        vertex-index order.
+        """
+        if face_indices is not None:
+            all_faces = np.asarray(mesh.faces, dtype=int)
+            chosen_faces = all_faces[np.asarray(face_indices, dtype=int)]
+            vertex_ids = np.unique(chosen_faces.reshape(-1))
+            return vertices[vertex_ids, :2]
+        return vertices[:, :2]
+
+    @staticmethod
+    def _minimal_angle_to_align_axis(
+        source_angle: float, target_angle: float
+    ) -> float:
+        """Return the smallest rotation that aligns an undirected axis.
+
+        An axis has no direction, so pointing it at ``target_angle`` or at
+        ``target_angle + pi`` are both acceptable. The rotation with the
+        smaller magnitude is returned; on a tie, the one towards
+        ``target_angle``.
+
+        Args:
+            source_angle: Current axis angle in radians.
+            target_angle: Desired axis angle in radians.
+
+        Returns:
+            The signed rotation in radians, within ``[-pi, pi)``.
+        """
+        towards_target = GeometryManager._wrap_to_pi(target_angle - source_angle)
+        towards_opposite = GeometryManager._wrap_to_pi(
+            target_angle + np.pi - source_angle
+        )
+        return min((towards_target, towards_opposite), key=abs)
+
+    @staticmethod
+    def _wrap_to_pi(angle: float) -> float:
+        """Wrap an angle in radians into the half-open interval ``[-pi, pi)``."""
+        two_pi = 2.0 * np.pi
+        return (angle + np.pi) % two_pi - np.pi
+
+    @staticmethod
+    def _z_axis_rotation_transform(angle: float) -> Any:
+        """Build a 4x4 homogeneous rotation about the Z axis.
+
+        Args:
+            angle: Rotation in radians; positive turns +X towards +Y.
+        """
+        cos_a = float(np.cos(angle))
+        sin_a = float(np.sin(angle))
+        rotation_block = np.array(
+            [
+                [cos_a, -sin_a, 0.0],
+                [sin_a, cos_a, 0.0],
+                [0.0, 0.0, 1.0],
+            ],
+            dtype=float,
+        )
+        matrix = np.eye(4)
+        matrix[:3, :3] = rotation_block
+        return matrix
+
+
+    @staticmethod
+    def _rotation_transform_between_vectors(
+        source: Any, target: Any
+    ) -> Any:
+        """Build a 4x4 transform rotating ``source`` onto ``target``.
+
+        The dot product is used directly as the cosine of the angle, so both
+        inputs are expected to be unit vectors. Nearly parallel vectors give
+        the identity; nearly opposite vectors give a half turn about an axis
+        orthogonal to ``source``.
+        """
+        cosine = float(np.clip(np.dot(source, target), -1.0, 1.0))
+        matrix = np.eye(4)
+        if cosine > 1.0 - 1e-8:
+            return matrix
+
+        if cosine < -1.0 + 1e-8:
+            # The cross product vanishes for opposite vectors, so pick any
+            # perpendicular axis for the half turn.
+            pivot = GeometryManager._orthogonal_axis(source)
+            turn = np.pi
+        else:
+            pivot = GeometryManager._normalize(np.cross(source, target))
+            turn = float(np.arccos(cosine))
+        matrix[:3, :3] = GeometryManager._axis_angle_rotation(pivot, turn)
+        return matrix
+
+    @staticmethod
+    def _axis_angle_rotation(axis: Any, angle: float) -> Any:
+        """Build the 3x3 rotation matrix for ``angle`` radians about ``axis``.
+
+        The axis is normalized first; the matrix follows Rodrigues' rotation
+        formula written out element by element.
+        """
+        ux, uy, uz = GeometryManager._normalize(axis)
+        cos_a = float(np.cos(angle))
+        sin_a = float(np.sin(angle))
+        versine = 1.0 - cos_a
+        row_x = [
+            cos_a + ux * ux * versine,
+            ux * uy * versine - uz * sin_a,
+            ux * uz * versine + uy * sin_a,
+        ]
+        row_y = [
+            uy * ux * versine + uz * sin_a,
+            cos_a + uy * uy * versine,
+            uy * uz * versine - ux * sin_a,
+        ]
+        row_z = [
+            uz * ux * versine - uy * sin_a,
+            uz * uy * versine + ux * sin_a,
+            cos_a + uz * uz * versine,
+        ]
+        return np.array([row_x, row_y, row_z], dtype=float)
+
+    @staticmethod
+    def _orthogonal_axis(vector: Any) -> Any:
+        """Return a unit vector perpendicular to ``vector``.
+
+        The X axis is used as the cross-product partner unless its dot product
+        with ``vector`` exceeds 0.9 in magnitude, in which case the Y axis is
+        used instead.
+        """
+        x_axis = np.array([1.0, 0.0, 0.0])
+        y_axis = np.array([0.0, 1.0, 0.0])
+        too_parallel = abs(float(np.dot(vector, x_axis))) > 0.9
+        partner = y_axis if too_parallel else x_axis
+        return GeometryManager._normalize(np.cross(vector, partner))
+
+    @staticmethod
+    def _normalize(vector: Any) -> Any:
+        """Scale ``vector`` to unit length.
+
+        A zero-length vector is returned unchanged (the same object), so
+        callers can detect it by checking the norm of the result.
+        """
+        length = float(np.linalg.norm(vector))
+        if length == 0.0:
+            return vector
+        unit = vector / length
+        return unit
+
+    @staticmethod
+    def _mean_abs_log_ratio_error(lhs: Any, rhs: Any) -> float:
+        """Return the mean of ``|log(lhs / rhs)|`` over all elements.
+
+        Both inputs are floored at 1e-6 first so the logarithm stays finite
+        for zero or negative entries.
+        """
+        floor = 1.0e-6
+        numerators = np.maximum(np.asarray(lhs, dtype=np.float64), floor)
+        denominators = np.maximum(np.asarray(rhs, dtype=np.float64), floor)
+        log_ratios = np.log(numerators / denominators)
+        return float(np.mean(np.abs(log_ratios)))
+
+    @staticmethod
+    def _validate_mesh(mesh: Any) -> None:
+        """Check that ``mesh`` exposes non-empty ``vertices`` and ``faces``.
+
+        Raises:
+            ValueError: If either attribute is missing or empty.
+        """
+        looks_like_mesh = hasattr(mesh, "vertices") and hasattr(mesh, "faces")
+        if not looks_like_mesh:
+            raise ValueError("Loaded geometry is not a mesh.")
+        if len(mesh.vertices) == 0 or len(mesh.faces) == 0:
+            raise ValueError("Mesh must contain vertices and faces.")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/schemas.py
new file mode 100644
index 000000000..f001720fc
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/schemas.py
@@ -0,0 +1,201 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+# Public schema names, kept alphabetical to match the package ``__init__``.
+__all__ = [
+    "AlignToAxisRequest",
+    "AlignToAxisResult",
+    "AlignXYLongAxisRequest",
+    "AlignXYLongAxisResult",
+    "CenterMeshRequest",
+    "CenterMeshResult",
+    "ConvertUpAxisRequest",
+    "ConvertUpAxisResult",
+    "DetectTabletopRequest",
+    "DetectTabletopResult",
+    "ExportMeshRequest",
+    "ExportMeshResult",
+    "LoadMeshRequest",
+    "LoadMeshResult",
+    "NormalizeRequest",
+    "NormalizeResult",
+    "PlaceAbovePlaneRequest",
+    "PlaceAbovePlaneResult",
+    "SupportPlaneCandidate",
+]
+
+
+@dataclass(frozen=True)
+class SupportPlaneCandidate:
+    """Candidate planar tabletop support surface."""
+
+    # Normal of the planar patch.
+    normal: list[float]
+    # Center point of the patch.
+    center: list[float]
+    # Total area of the faces forming the patch.
+    area: float
+    # Indices of the mesh faces assigned to the patch.
+    face_indices: list[int]
+    # Number of mesh vertices on the negative side of the plane.
+    below_vertex_count: int
+    # Number of mesh vertices on the positive side of the plane.
+    above_vertex_count: int
+    # NOTE(review): despite "area" in the name, GeometryManager._build_candidate
+    # stores the sum of absolute vertex-to-plane distances on each side here.
+    below_area_score: float
+    above_area_score: float
+    # Ranking value; higher is a more likely tabletop.
+    score: float
+
+
+@dataclass(frozen=True)
+class LoadMeshRequest:
+    """Request to load a GLB/mesh file."""
+
+    # Path of the mesh file to read.
+    mesh_path: Path
+
+
+@dataclass(frozen=True)
+class LoadMeshResult:
+    """Result of loading a mesh file."""
+
+    # Loaded geometry object; typed ``Any`` so this module needs no mesh library.
+    mesh: Any
+
+
+@dataclass(frozen=True)
+class ExportMeshRequest:
+    """Request to export a mesh to a file."""
+
+    # Geometry to write; it must provide an ``export(path)`` method.
+    mesh: Any
+    # Destination file path.
+    output_path: Path
+
+
+@dataclass(frozen=True)
+class ExportMeshResult:
+    """Result of exporting a mesh."""
+
+    # Path the mesh was written to.
+    output_path: Path
+
+
+@dataclass(frozen=True)
+class ConvertUpAxisRequest:
+    """Request to convert a mesh from one up-axis convention to another."""
+
+    # Mesh to rotate.
+    mesh: Any
+    # Up axis of the incoming mesh; ``None`` lets the manager use its default.
+    input_up_axis: list[float] | None = None
+    # Up axis to convert to; ``None`` lets the manager use its default.
+    output_up_axis: list[float] | None = None
+
+
+@dataclass(frozen=True)
+class ConvertUpAxisResult:
+    """Result of converting a mesh up-axis."""
+
+    # Mesh expressed in the requested up-axis convention.
+    mesh: Any
+
+
+@dataclass(frozen=True)
+class CenterMeshRequest:
+    """Request to center a mesh by its bounding-box center."""
+
+    # Mesh to center.
+    mesh: Any
+
+
+@dataclass(frozen=True)
+class CenterMeshResult:
+    """Result of centering a mesh."""
+
+    # Centered mesh.
+    mesh: Any
+    # Bounding-box center of the mesh before it was centered.
+    bbox_center: list[float]
+
+
+@dataclass(frozen=True)
+class AlignToAxisRequest:
+    """Request to rotate a mesh so a source axis aligns to a target axis."""
+
+    # Mesh to rotate.
+    mesh: Any
+    # Direction in the mesh that should end up on ``target_axis``.
+    source_axis: list[float]
+    # Direction the source axis is rotated onto.
+    target_axis: list[float]
+
+
+@dataclass(frozen=True)
+class AlignToAxisResult:
+    """Result of aligning a mesh vector to an axis."""
+
+    # Rotated mesh.
+    mesh: Any
+
+
+@dataclass(frozen=True)
+class PlaceAbovePlaneRequest:
+    """Request to translate a mesh so its AABB bottom sits above the XY plane."""
+
+    # Mesh to translate.
+    mesh: Any
+    # Gap between the XY plane and the mesh's AABB bottom, in mesh units.
+    clearance: float = 0.01
+
+
+@dataclass(frozen=True)
+class PlaceAbovePlaneResult:
+    """Result of placing a mesh above the XY plane."""
+
+    # Translated mesh.
+    mesh: Any
+
+
+@dataclass(frozen=True)
+class DetectTabletopRequest:
+    """Request to detect the most likely tabletop plane in a mesh."""
+
+    # Mesh to analyze.
+    mesh: Any
+    # Angular tolerance, in degrees, for treating face normals as one plane.
+    normal_angle_tol_deg: float = 8.0
+    # Distance tolerance for treating faces as coplanar; ``None`` leaves the
+    # choice to the detector.
+    plane_distance_tol: float | None = None
+    # Minimum plane area, as a fraction of total mesh area, to keep a candidate.
+    min_area_ratio: float = 0.02
+    # Upper bound on the number of candidates returned.
+    max_candidates: int = 24
+
+
+@dataclass(frozen=True)
+class DetectTabletopResult:
+    """Result of detecting the tabletop plane with oriented normal."""
+
+    # Candidate chosen as the tabletop.
+    selected: SupportPlaneCandidate
+    # Normal of the selected plane after orientation; it may be the negation
+    # of ``selected.normal``.
+    oriented_normal: list[float]
+    # All candidates that were considered.
+    candidates: list[SupportPlaneCandidate]
+
+
+@dataclass(frozen=True)
+class AlignXYLongAxisRequest:
+    """Request to align a mesh XY long axis to the Y axis via PCA."""
+
+    # Mesh to rotate about Z.
+    mesh: Any
+    # Faces whose vertices drive the PCA; ``None`` means use every vertex.
+    face_indices: list[int] | None = None
+
+
+@dataclass(frozen=True)
+class AlignXYLongAxisResult:
+    """Result of PCA yaw alignment."""
+
+    # Rotated mesh.
+    mesh: Any
+    # Yaw that was applied, in degrees.
+    yaw_angle_degrees: float
+
+
+@dataclass(frozen=True)
+class NormalizeRequest:
+    """Request to normalize a mesh to a target size."""
+
+    # Mesh to scale.
+    mesh: Any
+    # Desired length of the mesh's longest bounding-box extent.
+    target_size: float = 1.0
+
+
+@dataclass(frozen=True)
+class NormalizeResult:
+    """Result of normalizing a mesh."""
+
+    # Scaled mesh.
+    mesh: Any
+    # Uniform factor that was applied to reach the target size.
+    scale_factor: float
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/utils.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/utils.py
new file mode 100644
index 000000000..f7a5bcfec
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/geometry_manager/utils.py
@@ -0,0 +1,581 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+ DetectTabletopRequest,
+ GeometryManager,
+)
+
+# Names exported by ``from <this module> import *``. Every entry is an
+# underscore-prefixed helper defined in this module.
+# NOTE(review): ``_detect_table_fit_support_quad``,
+# ``_largest_centered_table_fit_inscribed_rect``,
+# ``_load_table_fit_scene_internal_z`` and the ``_table_fit_*`` helpers defined
+# further down are not listed — confirm whether that omission is intentional.
+__all__ = [
+ "_compose_json_matrices",
+ "_compose_simready_to_aligned_matrix",
+ "_decompose_transform_matrix",
+ "_aabb_bottom_to_xy_plane_transform",
+ "_aabb_center",
+ "_compose_sam3d_multi_object_transform",
+ "_copy_scene_with_transform",
+ "_estimate_support_normal",
+ "_glb_to_sam3d_local_matrix",
+ "_load_scene_with_transform",
+ "_matrix_from_json",
+ "_quaternion_wxyz_to_matrix",
+ "_rotation_between_vectors",
+ "_row_linear_to_trimesh_matrix",
+ "_scale_transform",
+ "_scene_to_mesh",
+ "_support_normal_flip_transform",
+ "_transform_point",
+ "_validate_vector",
+ "_xy_aabb_center",
+ "_xy_aabb_size",
+ "_z_up_to_glb_y_up_transform",
+ "_z_yaw_transform",
+]
+
+
+def _compose_json_matrices(*values: Any) -> list[list[float]]:
+    """Multiply 4x4 matrices left to right and return the product as nested lists.
+
+    Every argument is first converted to a float64 array. If any of them is
+    not of shape (4, 4) an empty list is returned instead of raising. With no
+    arguments the result is the 4x4 identity.
+    """
+    as_arrays = [np.asarray(item, dtype=np.float64) for item in values]
+    for candidate in as_arrays:
+        if candidate.shape != (4, 4):
+            return []
+    composed = np.eye(4, dtype=np.float64)
+    for factor in as_arrays:
+        composed = composed @ factor
+    return composed.tolist()
+
+
+def _compose_simready_to_aligned_matrix(
+    *, raw_to_aligned_matrix: Any, raw_to_simready_matrix: Any
+) -> list[list[float]]:
+    """Return ``raw_to_aligned @ inv(raw_to_simready)`` as nested lists.
+
+    An empty list is returned when either input is not 4x4 or when
+    ``raw_to_simready_matrix`` cannot be inverted.
+    """
+    aligned_from_raw = np.asarray(raw_to_aligned_matrix, dtype=np.float64)
+    simready_from_raw = np.asarray(raw_to_simready_matrix, dtype=np.float64)
+    shapes_ok = aligned_from_raw.shape == (4, 4) and simready_from_raw.shape == (4, 4)
+    if not shapes_ok:
+        return []
+    try:
+        raw_from_simready = np.linalg.inv(simready_from_raw)
+    except np.linalg.LinAlgError:
+        # Singular input: report "no transform" rather than raising.
+        return []
+    return (aligned_from_raw @ raw_from_simready).tolist()
+
+
+def _decompose_transform_matrix(matrix_value: Any) -> dict[str, Any]:
+    """Split a 4x4 transform into translation, rotation matrix and per-axis scale.
+
+    The scale is the Euclidean norm of each column of the upper-left 3x3
+    block; the rotation is that block with each column divided by its norm.
+    Columns whose norm is not above 1e-12 keep the identity column instead.
+    Input that is not 4x4 yields three empty lists.
+    """
+    transform = np.asarray(matrix_value, dtype=np.float64)
+    if transform.shape != (4, 4):
+        return {"translation": [], "rotation_matrix": [], "scale": []}
+    upper_left = transform[:3, :3]
+    column_norms = np.linalg.norm(upper_left, axis=0)
+    basis = np.eye(3, dtype=np.float64)
+    for col, length in enumerate(column_norms):
+        if length > 1.0e-12:
+            basis[:, col] = upper_left[:, col] / length
+    decomposition: dict[str, Any] = {}
+    decomposition["translation"] = transform[:3, 3].tolist()
+    decomposition["rotation_matrix"] = basis.tolist()
+    decomposition["scale"] = column_norms.tolist()
+    return decomposition
+
+
+def _support_normal_flip_transform(
+    *,
+    support_normal: np.ndarray,
+    normal_alignment: np.ndarray,
+) -> np.ndarray:
+    """Return the transform that swaps ``normal_alignment`` for its flipped variant.
+
+    The result is ``R_flipped @ inv(normal_alignment)``, where ``R_flipped``
+    rotates ``-support_normal`` onto +Z. Applying it after
+    ``normal_alignment`` therefore cancels that alignment and applies the one
+    computed for the negated normal.
+    """
+    z_axis = np.array([0.0, 0.0, 1.0], dtype=np.float64)
+    align_flipped = _rotation_between_vectors(-support_normal, z_axis)
+    undo_alignment = np.linalg.inv(normal_alignment)
+    return align_flipped @ undo_alignment
+
+
+def _z_yaw_transform(yaw_degrees: float) -> np.ndarray:
+    """Return a 4x4 homogeneous rotation about the Z axis by ``yaw_degrees`` degrees."""
+    radians = np.deg2rad(yaw_degrees)
+    cos_yaw = float(np.cos(radians))
+    sin_yaw = float(np.sin(radians))
+    matrix = np.eye(4, dtype=np.float64)
+    # Only the XY 2x2 block differs from the identity for a rotation about Z.
+    matrix[0, 0] = cos_yaw
+    matrix[0, 1] = -sin_yaw
+    matrix[1, 0] = sin_yaw
+    matrix[1, 1] = cos_yaw
+    return matrix
+
+
+def _z_up_to_glb_y_up_transform() -> np.ndarray:
+    """Return the 4x4 rotation that takes the +Z direction onto the +Y direction."""
+    z_up = np.array([0.0, 0.0, 1.0], dtype=np.float64)
+    y_up = np.array([0.0, 1.0, 0.0], dtype=np.float64)
+    return _rotation_between_vectors(z_up, y_up)
+
+
+def _copy_scene_with_transform(scene: Any, transform: np.ndarray) -> Any:
+    """Return ``scene.copy()`` after calling ``apply_transform(transform)`` on the copy."""
+    duplicate = scene.copy()
+    duplicate.apply_transform(transform)
+    return duplicate
+
+
+def _matrix_from_json(value: Any, *, name: str) -> np.ndarray:
+    """Convert ``value`` to a float64 array and require it to be 4x4.
+
+    Raises:
+        ValueError: If the converted array is not of shape (4, 4); the
+            message starts with ``name``.
+    """
+    parsed = np.asarray(value, dtype=np.float64)
+    if parsed.shape == (4, 4):
+        return parsed
+    raise ValueError(f"{name} must be a 4x4 matrix.")
+
+
+def _load_scene_with_transform(
+    *,
+    path: Path,
+    transform: np.ndarray,
+    trimesh: Any,
+) -> Any:
+    """Load ``path`` with ``trimesh.load(..., force="scene")`` and transform it.
+
+    The ``trimesh`` module is passed in by the caller rather than imported
+    here. ``transform`` is applied to the loaded scene, which is then returned.
+    """
+    loaded = trimesh.load(path, force="scene")
+    loaded.apply_transform(transform)
+    return loaded
+
+
+def _scene_to_mesh(scene: Any, *, trimesh: Any) -> Any:
+    """Collapse ``scene`` into a single ``trimesh.Trimesh``.
+
+    A ``Trimesh`` input is returned unchanged. Otherwise the scene is dumped
+    with ``concatenate=True``; if that does not already yield a ``Trimesh``,
+    the ``Trimesh`` items of the dump are concatenated.
+
+    Raises:
+        ValueError: If the dump contains no ``Trimesh`` items.
+    """
+    mesh_type = trimesh.Trimesh
+    if isinstance(scene, mesh_type):
+        return scene
+    flattened = scene.dump(concatenate=True)
+    if isinstance(flattened, mesh_type):
+        return flattened
+    parts = []
+    for piece in flattened:
+        if isinstance(piece, mesh_type):
+            parts.append(piece)
+    if parts:
+        return trimesh.util.concatenate(parts)
+    raise ValueError("Scene contains no mesh geometry.")
+
+
+def _estimate_support_normal(mesh: Any) -> np.ndarray:
+    """Estimate a unit normal for the surface ``mesh`` rests on.
+
+    Strategy, in order:
+
+    1. Ask ``GeometryManager.detect_tabletop`` and normalise the returned
+       ``oriented_normal``.
+    2. If detection raises, or its normal has no positive length, use the
+       normal of the face with the largest area.
+    3. If the mesh exposes no face normals or areas, or that normal has no
+       positive length, return +Z.
+
+    Args:
+        mesh: Object exposing ``face_normals`` and ``area_faces`` and accepted
+            by ``DetectTabletopRequest``.
+
+    Returns:
+        A float64 vector of unit length.
+    """
+    z_axis = np.array([0.0, 0.0, 1.0], dtype=np.float64)
+    geom = GeometryManager()
+    try:
+        detect_result = geom.detect_tabletop(DetectTabletopRequest(mesh=mesh))
+        normal = np.asarray(detect_result.oriented_normal, dtype=np.float64)
+        norm = np.linalg.norm(normal)
+        if norm > 0.0:
+            return normal / norm
+    except Exception:
+        # Detection is best-effort, so keep falling through to the geometric
+        # fallback, but record the failure instead of discarding it silently.
+        logging.getLogger(__name__).debug(
+            "Tabletop detection failed; falling back to the largest-face normal.",
+            exc_info=True,
+        )
+
+    normals = np.asarray(mesh.face_normals, dtype=np.float64)
+    areas = np.asarray(mesh.area_faces, dtype=np.float64)
+    if normals.size == 0 or areas.size == 0:
+        return z_axis
+    normal = normals[int(np.argmax(areas))]
+    norm = np.linalg.norm(normal)
+    # ``norm > 0.0`` (rather than ``norm == 0.0``) also rejects NaN lengths,
+    # matching the check used for the detector's normal above.
+    if norm > 0.0:
+        return normal / norm
+    return z_axis
+
+
+def _rotation_between_vectors(source: np.ndarray, target: np.ndarray) -> np.ndarray:
+    """Return a 4x4 homogeneous rotation taking direction ``source`` onto ``target``.
+
+    Both vectors are normalised first, so only their directions matter.
+    (Nearly) parallel vectors give the identity. For (nearly) opposite vectors
+    a half-turn about an axis perpendicular to ``source`` is returned.
+    Otherwise the rotation is built with Rodrigues' formula about
+    ``source x target``.
+
+    Args:
+        source: 3-vector (array-like) giving the starting direction.
+        target: 3-vector (array-like) giving the desired direction.
+
+    Returns:
+        A float64 4x4 matrix whose upper-left 3x3 block is the rotation and
+        whose translation part is zero.
+
+    Raises:
+        ValueError: If either vector has zero or non-finite length. Without
+            this guard the normalisation would silently produce a NaN matrix.
+    """
+    source = np.asarray(source, dtype=np.float64)
+    target = np.asarray(target, dtype=np.float64)
+    source_norm = float(np.linalg.norm(source))
+    target_norm = float(np.linalg.norm(target))
+    lengths_ok = (
+        np.isfinite(source_norm)
+        and np.isfinite(target_norm)
+        and source_norm > 0.0
+        and target_norm > 0.0
+    )
+    if not lengths_ok:
+        raise ValueError("source and target must be non-zero, finite vectors.")
+    source = source / source_norm
+    target = target / target_norm
+    cross = np.cross(source, target)
+    dot = float(np.clip(np.dot(source, target), -1.0, 1.0))
+    if np.linalg.norm(cross) < 1e-8:
+        if dot > 0.0:
+            return np.eye(4, dtype=np.float64)
+        # Opposite directions: the cross product gives no usable axis, so pick
+        # any axis perpendicular to ``source`` (avoiding a near-parallel seed).
+        axis = np.array([1.0, 0.0, 0.0], dtype=np.float64)
+        if abs(float(np.dot(source, axis))) > 0.9:
+            axis = np.array([0.0, 1.0, 0.0], dtype=np.float64)
+        cross = np.cross(source, axis)
+    axis = cross / np.linalg.norm(cross)
+    angle = float(np.arccos(dot))
+    skew = np.array(
+        [
+            [0.0, -axis[2], axis[1]],
+            [axis[2], 0.0, -axis[0]],
+            [-axis[1], axis[0], 0.0],
+        ],
+        dtype=np.float64,
+    )
+    # Rodrigues' rotation formula: R = I + sin(a) K + (1 - cos(a)) K^2.
+    rotation = (
+        np.eye(3, dtype=np.float64)
+        + np.sin(angle) * skew
+        + (1.0 - np.cos(angle)) * (skew @ skew)
+    )
+    transform = np.eye(4, dtype=np.float64)
+    transform[:3, :3] = rotation
+    return transform
+
+
+def _transform_point(transform: np.ndarray, point: np.ndarray) -> np.ndarray:
+    """Apply a 4x4 homogeneous ``transform`` to a 3D ``point``.
+
+    The point is extended with w = 1 and the first three components of the
+    product are returned (no division by the resulting w).
+    """
+    lifted = np.empty(4, dtype=np.float64)
+    lifted[3] = 1.0
+    lifted[:3] = point
+    mapped = transform @ lifted
+    return mapped[:3]
+
+
+def _aabb_center(bounds: np.ndarray) -> np.ndarray:
+    """Return the midpoint of ``bounds[0]`` and ``bounds[1]`` as a float64 array."""
+    lower = np.asarray(bounds[0], dtype=np.float64)
+    upper = np.asarray(bounds[1], dtype=np.float64)
+    return 0.5 * (lower + upper)
+
+
+def _xy_aabb_center(bounds: np.ndarray) -> np.ndarray:
+    """Return the XY midpoint (first two columns) of a ``[min, max]`` bounds array."""
+    box = np.asarray(bounds, dtype=np.float64)
+    lower_xy = box[0, :2]
+    upper_xy = box[1, :2]
+    return 0.5 * (lower_xy + upper_xy)
+
+
+def _xy_aabb_size(bounds: np.ndarray) -> np.ndarray:
+    """Return the XY extent of a ``[min, max]`` bounds array.
+
+    Each component is clamped to at least 1e-6, so the result is never zero
+    or negative.
+    """
+    box = np.asarray(bounds, dtype=np.float64)
+    extent_xy = box[1, :2] - box[0, :2]
+    return np.maximum(extent_xy, 1e-6)
+
+
+def _aabb_bottom_to_xy_plane_transform(bounds: np.ndarray) -> np.ndarray:
+    """Return the 4x4 translation along Z that moves the bounds' minimum z to 0."""
+    box = np.asarray(bounds, dtype=np.float64)
+    lowest_z = float(box[0][2])
+    shift = np.eye(4, dtype=np.float64)
+    shift[2, 3] = -lowest_z
+    return shift
+
+
+def _scale_transform(scale: float) -> np.ndarray:
+    """Return a 4x4 homogeneous matrix that scales all three axes by ``scale``."""
+    factor = float(scale)
+    matrix = np.eye(4, dtype=np.float64)
+    matrix[:3, :3] = matrix[:3, :3] * factor
+    return matrix
+
+
+def _compose_sam3d_multi_object_transform(
+    *,
+    rotation_quaternion_wxyz: list[float],
+    translation: list[float],
+    scale: list[float],
+) -> np.ndarray:
+    """Compose the transform equivalent to the old baked multi-object export.
+
+    The row-vector linear part is ``basis @ diag(scale) @ rotation``, where
+    ``basis`` comes from ``_glb_to_sam3d_local_matrix`` and ``rotation`` from
+    the wxyz quaternion. It is then converted, together with ``translation``,
+    by ``_row_linear_to_trimesh_matrix``.
+    """
+    rotation_matrix = _quaternion_wxyz_to_matrix(rotation_quaternion_wxyz)
+    scale_vector = _validate_vector(scale, expected_len=3, name="scale")
+    basis = _glb_to_sam3d_local_matrix()
+    row_linear = basis @ np.diag(scale_vector) @ rotation_matrix
+    return _row_linear_to_trimesh_matrix(
+        linear_row=row_linear,
+        translation=translation,
+    )
+
+
+def _row_linear_to_trimesh_matrix(
+    *,
+    linear_row: np.ndarray,
+    translation: list[float],
+) -> np.ndarray:
+    """Convert a row-vector linear transform to trimesh's 4x4 matrix format.
+
+    The upper-left 3x3 block of the result is ``linear_row`` transposed and
+    the last column holds ``translation``, which must have three values.
+    """
+    offset = _validate_vector(translation, expected_len=3, name="translation")
+    matrix = np.eye(4, dtype=np.float64)
+    matrix[:3, 3] = offset
+    # Transpose: row-vector convention (v @ M) becomes column-vector (M.T @ v).
+    matrix[:3, :3] = linear_row.T
+    return matrix
+
+
+def _validate_vector(
+    values: list[float],
+    *,
+    expected_len: int,
+    name: str,
+) -> np.ndarray:
+    """Check the length of ``values`` and return it as a float64 array.
+
+    Raises:
+        ValueError: If ``len(values)`` differs from ``expected_len``; the
+            message names the offending argument via ``name``.
+    """
+    if len(values) == expected_len:
+        return np.asarray(values, dtype=np.float64)
+    raise ValueError(f"{name} must have {expected_len} values")
+
+
+def _glb_to_sam3d_local_matrix() -> np.ndarray:
+    """Return the basis conversion used by the old baked multi-object exporter.
+
+    The result is a fixed 3x3 float64 matrix with rows (1, 0, 0), (0, 0, 1)
+    and (0, -1, 0).
+    """
+    first_row = (1.0, 0.0, 0.0)
+    second_row = (0.0, 0.0, 1.0)
+    third_row = (0.0, -1.0, 0.0)
+    return np.array([first_row, second_row, third_row], dtype=np.float64)
+
+
+def _quaternion_wxyz_to_matrix(quaternion: list[float]) -> np.ndarray:
+    """Convert a wxyz quaternion to a 3x3 rotation matrix.
+
+    The quaternion is normalised before conversion, so it does not need to be
+    of unit length.
+
+    Raises:
+        ValueError: If ``quaternion`` does not have exactly four values or
+            has zero norm.
+    """
+    if len(quaternion) != 4:
+        raise ValueError("rotation_quaternion_wxyz must have 4 values")
+    w, x, y, z = (float(component) for component in quaternion)
+    magnitude = np.sqrt(w * w + x * x + y * y + z * z)
+    if magnitude == 0.0:
+        raise ValueError("rotation quaternion must be non-zero")
+    w, x, y, z = w / magnitude, x / magnitude, y / magnitude, z / magnitude
+
+    # Pairwise products shared by several matrix entries.
+    xx, yy, zz = x * x, y * y, z * z
+    xy, xz, yz = x * y, x * z, y * z
+    xw, yw, zw = x * w, y * w, z * w
+
+    first_row = [1.0 - 2.0 * (yy + zz), 2.0 * (xy - zw), 2.0 * (xz + yw)]
+    second_row = [2.0 * (xy + zw), 1.0 - 2.0 * (xx + zz), 2.0 * (yz - xw)]
+    third_row = [2.0 * (xz - yw), 2.0 * (yz + xw), 1.0 - 2.0 * (xx + yy)]
+    return np.array([first_row, second_row, third_row], dtype=np.float64)
+
+
+def _detect_table_fit_support_quad(
+    mesh: Any,
+    *,
+    target_aspect: float,
+) -> dict[str, Any]:
+    """Fit a rectangle of a given aspect ratio inside the detected support surface.
+
+    Steps:
+
+    1. Detect the tabletop with ``GeometryManager.detect_tabletop``.
+    2. Collect the vertices referenced by the selected plane's faces.
+    3. Take the 2D convex hull of their XY coordinates.
+    4. Search for the largest centred rectangle with width / depth equal to
+       ``target_aspect`` (floored at 1e-6) inside that hull.
+
+    NOTE(review): only XY coordinates are used, which presumably assumes the
+    support plane is already roughly horizontal — confirm with callers.
+
+    Returns:
+        A dict describing the rectangle (centre, size, yaw, corners), the
+        detected normal and area, and the hull it was fitted in. The centre's
+        z is the mean z of the support vertices.
+    """
+    detection = GeometryManager().detect_tabletop(DetectTabletopRequest(mesh=mesh))
+    face_table = np.asarray(mesh.faces, dtype=np.int64)
+    vertex_table = np.asarray(mesh.vertices, dtype=np.float64)
+    selected_faces = np.asarray(detection.selected.face_indices, dtype=np.int64)
+    support_vertex_ids = np.unique(face_table[selected_faces])
+    support_vertices = vertex_table[support_vertex_ids]
+
+    hull_xy = _table_fit_convex_hull_2d(support_vertices[:, :2])
+    aspect = max(float(target_aspect), 1.0e-6)
+    quad = _largest_centered_table_fit_inscribed_rect(hull_xy, target_aspect=aspect)
+    center_z = float(np.mean(support_vertices[:, 2]))
+
+    center_xy = quad["center_xy"]
+    yaw = quad["yaw_radians"]
+    return {
+        "method": "sampled_centered_inscribed_rectangle_on_support_convex_hull",
+        "normal": detection.oriented_normal,
+        "area": float(detection.selected.area),
+        "center": [center_xy[0], center_xy[1], center_z],
+        "center_xy": center_xy,
+        "size_xy": quad["size_xy"],
+        "yaw_radians": yaw,
+        "yaw_degrees": float(np.rad2deg(yaw)),
+        "corners_xy": quad["corners_xy"],
+        "support_hull_xy": hull_xy.tolist(),
+    }
+
+
+def _largest_centered_table_fit_inscribed_rect(
+    hull_xy: np.ndarray,
+    *,
+    target_aspect: float,
+    yaw_samples: int = 180,
+) -> dict[str, Any]:
+    """Search for the largest fixed-aspect rectangle centred inside a convex hull.
+
+    ``yaw_samples`` orientations are sampled uniformly in [0, pi). For each
+    orientation two candidate centres are tried: the mean of the hull
+    vertices and the centre of the hull's axis-aligned bounding box. For each
+    pair, the rectangle depth is found by 40 bisection steps, keeping the
+    largest depth for which all four corners lie inside the hull; the width
+    is ``target_aspect * depth``. The candidate with the largest area wins.
+
+    Args:
+        hull_xy: Convex polygon vertices, one XY pair per row.
+        target_aspect: Width / depth ratio of the rectangle.
+        yaw_samples: Number of orientations to sample.
+
+    Returns:
+        A dict with ``center_xy``, ``size_xy`` (width, depth),
+        ``yaw_radians``, ``corners_xy`` and ``area``.
+
+    Raises:
+        ValueError: If the hull has fewer than three points or no candidate
+            was produced (for example ``yaw_samples`` is 0).
+    """
+    if hull_xy.shape[0] < 3:
+        raise ValueError("Support hull must contain at least 3 points.")
+    best: dict[str, Any] | None = None
+    centers = [
+        np.mean(hull_xy, axis=0),
+        0.5 * (np.min(hull_xy, axis=0) + np.max(hull_xy, axis=0)),
+    ]
+    for yaw in np.linspace(0.0, np.pi, yaw_samples, endpoint=False):
+        rot = _table_fit_rot2(-yaw)
+        inv_rot = _table_fit_rot2(yaw)
+        rotated_hull = hull_xy @ rot.T
+        # The bisection's upper bound comes from the hull's bounding box in
+        # the rotated frame. It depends only on the yaw, so compute it once
+        # per orientation instead of once per candidate centre.
+        bbox_size = np.max(rotated_hull, axis=0) - np.min(rotated_hull, axis=0)
+        depth_upper_bound = float(
+            max(bbox_size[0] / target_aspect, bbox_size[1], 1.0e-6)
+        )
+        for center_world in centers:
+            center = center_world @ rot.T
+            lo = 0.0
+            hi = depth_upper_bound
+            for _ in range(40):
+                mid = 0.5 * (lo + hi)
+                width = target_aspect * mid
+                depth = mid
+                corners = _table_fit_rect_corners(
+                    center=center,
+                    width=width,
+                    depth=depth,
+                )
+                corners_world = corners @ inv_rot.T
+                if all(
+                    _table_fit_point_in_convex_polygon(point, hull_xy)
+                    for point in corners_world
+                ):
+                    lo = mid
+                else:
+                    hi = mid
+            # ``lo`` is the largest depth known to fit.
+            width = target_aspect * lo
+            depth = lo
+            area = width * depth
+            corners_world = (
+                _table_fit_rect_corners(center=center, width=width, depth=depth)
+                @ inv_rot.T
+            )
+            candidate = {
+                "center_xy": center_world.tolist(),
+                "size_xy": [float(width), float(depth)],
+                "yaw_radians": float(yaw),
+                "corners_xy": corners_world.tolist(),
+                "area": float(area),
+            }
+            if best is None or area > float(best["area"]):
+                best = candidate
+    if best is None:
+        raise ValueError("Failed to estimate an inscribed support rectangle.")
+    return best
+
+
+def _load_table_fit_scene_internal_z(
+    path: Path,
+    *,
+    trimesh: Any,
+    y_to_z: np.ndarray,
+) -> Any:
+    """Load a GLB as a scene and apply the ``y_to_z`` transform to it.
+
+    Raises:
+        FileNotFoundError: If ``path`` is not an existing file.
+    """
+    if path.is_file():
+        loaded = trimesh.load(path, force="scene")
+        loaded.apply_transform(y_to_z)
+        return loaded
+    raise FileNotFoundError(f"GLB not found: {path}")
+
+
+def _table_fit_scene_union_bounds(scenes: list[Any], *, trimesh: Any) -> np.ndarray:
+    """Return the bounds enclosing every scene as a ``[min, max]`` array.
+
+    Each scene is collapsed with ``_scene_to_mesh`` and its ``bounds`` read;
+    the result stacks the component-wise minimum of all lower corners on top
+    of the component-wise maximum of all upper corners.
+    """
+    lower_corners = []
+    upper_corners = []
+    for scene in scenes:
+        mesh = _scene_to_mesh(scene, trimesh=trimesh)
+        box = np.asarray(mesh.bounds, dtype=np.float64)
+        lower_corners.append(box[0])
+        upper_corners.append(box[1])
+    union_min = np.vstack(lower_corners).min(axis=0)
+    union_max = np.vstack(upper_corners).max(axis=0)
+    return np.vstack([union_min, union_max])
+
+
+def _table_fit_bounds_xy_manifest(
+    bounds: np.ndarray,
+    *,
+    unit_scale: float,
+) -> dict[str, Any]:
+    """Describe the XY footprint of ``bounds`` after multiplying by ``unit_scale``.
+
+    Returns a dict with the scaled XY minimum, maximum, centre and size, plus
+    the footprint area (width times depth).
+
+    NOTE(review): the ``"unit"`` entry is always ``"cm"`` regardless of
+    ``unit_scale`` — callers presumably pass the factor that converts to
+    centimetres; confirm.
+    """
+    lower_xy = bounds[0, :2] * unit_scale
+    upper_xy = bounds[1, :2] * unit_scale
+    extent_xy = upper_xy - lower_xy
+    midpoint_xy = 0.5 * (lower_xy + upper_xy)
+    footprint_area = float(extent_xy[0] * extent_xy[1])
+    manifest: dict[str, Any] = {"unit": "cm"}
+    manifest["min_xy"] = lower_xy.tolist()
+    manifest["max_xy"] = upper_xy.tolist()
+    manifest["center_xy"] = midpoint_xy.tolist()
+    manifest["size_xy"] = extent_xy.tolist()
+    manifest["area"] = footprint_area
+    return manifest
+
+
+def _table_fit_uniform_xy_scale_transform(
+    *,
+    center_xy: np.ndarray,
+    scale: float,
+) -> np.ndarray:
+    """Return a 4x4 transform scaling X and Y by ``scale`` about ``center_xy``.
+
+    Z is left unscaled. The matrix is ``T(center) @ S @ T(-center)``, so the
+    point ``(center_x, center_y, z)`` stays fixed.
+    """
+    pivot_x = float(center_xy[0])
+    pivot_y = float(center_xy[1])
+    factor = float(scale)
+
+    to_pivot = np.eye(4, dtype=np.float64)
+    to_pivot[:3, 3] = [pivot_x, pivot_y, 0.0]
+    from_pivot = np.eye(4, dtype=np.float64)
+    from_pivot[:3, 3] = [-pivot_x, -pivot_y, 0.0]
+
+    planar_scale = np.eye(4, dtype=np.float64)
+    for axis in (0, 1):
+        planar_scale[axis, axis] = factor
+    return to_pivot @ planar_scale @ from_pivot
+
+
+def _table_fit_uniform_scale_transform(
+    *,
+    center_xy: np.ndarray,
+    scale: float,
+) -> np.ndarray:
+    """Return a 4x4 transform scaling X, Y and Z by ``scale`` about ``center_xy``.
+
+    The pivot is ``(center_x, center_y, 0)``: the matrix is
+    ``T(pivot) @ S @ T(-pivot)``, so Z is scaled about the z = 0 plane.
+    """
+    pivot_x = float(center_xy[0])
+    pivot_y = float(center_xy[1])
+    factor = float(scale)
+
+    to_pivot = np.eye(4, dtype=np.float64)
+    to_pivot[:3, 3] = [pivot_x, pivot_y, 0.0]
+    from_pivot = np.eye(4, dtype=np.float64)
+    from_pivot[:3, 3] = [-pivot_x, -pivot_y, 0.0]
+
+    uniform_scale = np.eye(4, dtype=np.float64)
+    uniform_scale[:3, :3] = uniform_scale[:3, :3] * factor
+    return to_pivot @ uniform_scale @ from_pivot
+
+
+def _table_fit_safe_positive_ratio(numerator: float, denominator: float) -> float:
+    """Return ``numerator / denominator`` with both the divisor and result floored at 1e-6."""
+    floor = 1.0e-6
+    safe_denominator = max(float(denominator), floor)
+    ratio = float(numerator) / safe_denominator
+    return max(ratio, floor)
+
+
+def _table_fit_convex_hull_2d(points: np.ndarray) -> np.ndarray:
+    """Return the 2D convex hull of ``points`` using the monotone-chain method.
+
+    Only the first two columns of ``points`` are used. Duplicate points are
+    removed and collinear points are dropped from the hull. The vertices are
+    returned counter-clockwise, starting from the lexicographically smallest
+    point. With zero or one distinct point the distinct points themselves are
+    returned.
+    """
+    xy_rows = np.asarray(points)[:, :2]
+    ordered = sorted({(float(px), float(py)) for px, py in xy_rows})
+    if len(ordered) <= 1:
+        return np.asarray(ordered, dtype=np.float64)
+
+    def turn(
+        origin: tuple[float, float],
+        first: tuple[float, float],
+        second: tuple[float, float],
+    ) -> float:
+        # z-component of (first - origin) x (second - origin); > 0 is a left turn.
+        return (first[0] - origin[0]) * (second[1] - origin[1]) - (
+            first[1] - origin[1]
+        ) * (second[0] - origin[0])
+
+    def half_hull(sequence) -> list[tuple[float, float]]:
+        chain: list[tuple[float, float]] = []
+        for vertex in sequence:
+            # Drop the last vertex while it would make a right turn or be collinear.
+            while len(chain) >= 2 and turn(chain[-2], chain[-1], vertex) <= 0.0:
+                chain.pop()
+            chain.append(vertex)
+        return chain
+
+    lower = half_hull(ordered)
+    upper = half_hull(reversed(ordered))
+    # Each chain ends where the other begins, so drop the repeated endpoints.
+    return np.asarray(lower[:-1] + upper[:-1], dtype=np.float64)
+
+
+def _table_fit_point_in_convex_polygon(
+    point: np.ndarray,
+    polygon: np.ndarray,
+) -> bool:
+    """Return whether ``point`` lies inside or on a convex polygon.
+
+    For every edge the sign of the 2D cross product of the edge direction and
+    the vector from the edge start to the point is computed. The point is
+    inside when all non-negligible cross products share one sign. Cross
+    products with magnitude below 1e-9 (point on or very near the edge's
+    line) are ignored, so boundary points count as inside.
+
+    Args:
+        point: XY coordinates of the query point.
+        polygon: Convex polygon vertices in order, one XY pair per row.
+
+    Returns:
+        ``True`` if no edge puts the point on the opposite side from the
+        others. This includes polygons with no edges.
+    """
+    point_x = float(point[0])
+    point_y = float(point[1])
+    vertex_count = len(polygon)
+    reference = 0.0
+    for index in range(vertex_count):
+        start = polygon[index]
+        end = polygon[(index + 1) % vertex_count]
+        # Explicit 2D cross product: np.cross on 2-element vectors is
+        # deprecated as of NumPy 2.0.
+        turn = float(
+            (end[0] - start[0]) * (point_y - start[1])
+            - (end[1] - start[1]) * (point_x - start[0])
+        )
+        if abs(turn) < 1.0e-9:
+            continue
+        if reference == 0.0:
+            reference = turn
+        elif (turn > 0.0) != (reference > 0.0):
+            # Compare signs directly. Thresholding the product of two cross
+            # products would let outside points through when the geometry is
+            # small, because the product underflows any fixed tolerance.
+            return False
+    return True
+
+
+def _table_fit_rect_corners(
+    *,
+    center: np.ndarray,
+    width: float,
+    depth: float,
+) -> np.ndarray:
+    """Return the four corners of an axis-aligned rectangle as a (4, 2) array.
+
+    ``width`` extends along the first axis and ``depth`` along the second.
+    Corners are ordered (min, min), (max, min), (max, max), (min, max).
+    """
+    half_width = 0.5 * float(width)
+    half_depth = 0.5 * float(depth)
+    left = center[0] - half_width
+    right = center[0] + half_width
+    near = center[1] - half_depth
+    far = center[1] + half_depth
+    corners = [[left, near], [right, near], [right, far], [left, far]]
+    return np.asarray(corners, dtype=np.float64)
+
+
+def _table_fit_rot2(angle: float) -> np.ndarray:
+    """Return the 2x2 counter-clockwise rotation matrix for ``angle`` in radians."""
+    cos_a = float(np.cos(angle))
+    sin_a = float(np.sin(angle))
+    top_row = [cos_a, -sin_a]
+    bottom_row = [sin_a, cos_a]
+    return np.asarray([top_row, bottom_row], dtype=np.float64)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/__init__.py
new file mode 100644
index 000000000..c7a200a51
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/__init__.py
@@ -0,0 +1,35 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_generation_manager.manager import (
+ ASSET_IMAGE_PROMPT_SUFFIX,
+ ImageGenerationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_generation_manager.schemas import (
+ ImageGenerationRequest,
+ ImageGenerationResult,
+ TextToAssetImageRequest,
+)
+
+# Names re-exported from the ``manager`` and ``schemas`` submodules imported above.
+__all__ = [
+ "ASSET_IMAGE_PROMPT_SUFFIX",
+ "ImageGenerationManager",
+ "ImageGenerationRequest",
+ "ImageGenerationResult",
+ "TextToAssetImageRequest",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/manager.py
new file mode 100644
index 000000000..6406f74d3
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/manager.py
@@ -0,0 +1,76 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_generation_client import (
+ ImageGenerationClient,
+ ImageGenerationError,
+ ImageGenerationServerRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_generation_manager.schemas import (
+ ImageGenerationRequest,
+ ImageGenerationResult,
+ TextToAssetImageRequest,
+)
+
+# Text that ``_build_asset_image_prompt`` appends to a user prompt.
+# Note that the literal ends with a trailing space.
+ASSET_IMAGE_PROMPT_SUFFIX = (
+ "single isolated object, centered, fully visible, "
+ "on a high contrast colored background. "
+)
+
+
+class ImageGenerationManager:
+    """Image generation domain operations."""
+
+    def __init__(self, *, client: ImageGenerationClient | None = None) -> None:
+        """Use ``client`` when one is supplied; otherwise create a default client."""
+        self.client = client or ImageGenerationClient()
+
+    def generate_image(self, request: ImageGenerationRequest) -> ImageGenerationResult:
+        """Generate one image through the client and return its resolved path.
+
+        The output path is expanded and resolved before it is sent to the
+        client; the path reported back by the client is expanded and resolved
+        as well.
+
+        Raises:
+            RuntimeError: If the client answers with an ``ImageGenerationError``;
+                the message is the error's ``error_message``.
+        """
+        resolved_output = request.output_path.expanduser().resolve()
+        server_request = ImageGenerationServerRequest(
+            prompt=request.prompt,
+            output_path=resolved_output,
+        )
+        response = self.client.generate(server_request)
+        if isinstance(response, ImageGenerationError):
+            raise RuntimeError(response.error_message)
+
+        written_path = Path(response.result.image_path).expanduser().resolve()
+        return ImageGenerationResult(image_path=written_path)
+
+    def generate_asset_image_from_text(
+        self,
+        request: TextToAssetImageRequest,
+    ) -> Path:
+        """Generate an asset image for ``request.prompt`` and return its path.
+
+        The prompt is first passed through ``_build_asset_image_prompt``.
+        """
+        asset_prompt = _build_asset_image_prompt(request.prompt)
+        image_request = ImageGenerationRequest(
+            prompt=asset_prompt,
+            output_path=request.output_path,
+        )
+        return self.generate_image(image_request).image_path
+
+
+def _build_asset_image_prompt(prompt: str) -> str:
+    """Return ``prompt`` with ``ASSET_IMAGE_PROMPT_SUFFIX`` appended exactly once.
+
+    The prompt is stripped of surrounding whitespace first. If it already
+    contains the suffix text, the stripped prompt is returned unchanged.
+
+    Raises:
+        ValueError: If the prompt is empty after stripping.
+    """
+    prompt = prompt.strip()
+    if not prompt:
+        raise ValueError("Text-to-asset image prompt must be non-empty.")
+    # The suffix constant ends with a space while ``prompt`` has just been
+    # stripped, so compare against the stripped suffix. Otherwise a prompt that
+    # already ends with the suffix (e.g. the output of a previous call) would
+    # never match and the suffix would be appended a second time.
+    if ASSET_IMAGE_PROMPT_SUFFIX.strip() in prompt:
+        return prompt
+    return f"{prompt}, {ASSET_IMAGE_PROMPT_SUFFIX}"
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/schemas.py
new file mode 100644
index 000000000..ac4a9cd7e
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_generation_manager/schemas.py
@@ -0,0 +1,43 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class TextToAssetImageRequest:
+ """Request for generating an asset image from a text prompt."""
+
+ # Text prompt describing the asset.
+ prompt: str
+ # Path at which the generated image is requested.
+ output_path: Path
+
+
+@dataclass(frozen=True)
+class ImageGenerationRequest:
+ """Request for generating one image from text."""
+
+ # Text prompt sent to the image generator.
+ prompt: str
+ # Path at which the generated image is requested.
+ output_path: Path
+
+
+@dataclass(frozen=True)
+class ImageGenerationResult:
+ """Generated image path."""
+
+ # Path of the generated image.
+ image_path: Path
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/__init__.py
new file mode 100644
index 000000000..fbbf31487
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/__init__.py
@@ -0,0 +1,33 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_segmentation_manager.manager import (
+ ImageSegmentationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_segmentation_manager.schemas import (
+ AssetImageToRgbaRequest,
+ ImageSegmentationRequest,
+ ImageSegmentationResult,
+)
+
+# Names re-exported from the ``manager`` and ``schemas`` submodules imported above.
+__all__ = [
+ "AssetImageToRgbaRequest",
+ "ImageSegmentationManager",
+ "ImageSegmentationRequest",
+ "ImageSegmentationResult",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/manager.py
new file mode 100644
index 000000000..052b8d7db
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/manager.py
@@ -0,0 +1,90 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client import (
+ ImageSegmentationClient,
+ ImageSegmentationError,
+ ImageSegmentationServerRequest,
+ apply_mask_to_alpha,
+ decode_rle_mask,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_segmentation_manager.schemas import (
+ AssetImageToRgbaRequest,
+ ImageSegmentationRequest,
+ ImageSegmentationResult,
+)
+
+
+class ImageSegmentationManager:
+    """Image segmentation domain operations."""
+
+    def __init__(self, *, client: ImageSegmentationClient | None = None) -> None:
+        """Use ``client`` when one is supplied; otherwise create a default client."""
+        self.client = client or ImageSegmentationClient()
+
+    def segment_image(
+        self,
+        request: ImageSegmentationRequest,
+    ) -> ImageSegmentationResult:
+        """Segment one image with one text prompt and return the candidates.
+
+        The image path is expanded and resolved, then validated together with
+        the prompt before the client is called with the stripped prompt.
+
+        Raises:
+            FileNotFoundError: If the resolved image path is not a file.
+            ValueError: If the prompt is blank.
+            RuntimeError: If the client answers with an
+                ``ImageSegmentationError``.
+        """
+        image_path = request.image_path.expanduser().resolve()
+        _validate_segment_request(image_path=image_path, prompt=request.prompt)
+
+        response = self.client.segment(
+            ImageSegmentationServerRequest(
+                prompt=request.prompt.strip(),
+                image_path=image_path,
+            ),
+        )
+        if isinstance(response, ImageSegmentationError):
+            raise RuntimeError(response.error_message)
+
+        return ImageSegmentationResult(candidates=list(response.result.candidates))
+
+    def convert_asset_image_to_rgba(
+        self,
+        request: AssetImageToRgbaRequest,
+    ) -> Path:
+        """Cut the prompted object out of an image and save it as RGBA.
+
+        The first segmentation candidate's mask is decoded and applied as the
+        alpha channel, and the result is written to the resolved output path
+        (parent directories are created as needed).
+
+        Returns:
+            The resolved path of the written RGBA image.
+
+        Raises:
+            ValueError: If segmentation returns no candidates or the first
+                candidate has no ``mask_rle``.
+            FileNotFoundError: If the output file does not exist after saving.
+        """
+        # Resolve once and use the same path for segmentation and for the alpha
+        # cut-out, so "~" or relative inputs cannot make the two steps read
+        # different files.
+        image_path = request.image_path.expanduser().resolve()
+        segmentation_result = self.segment_image(
+            ImageSegmentationRequest(
+                image_path=image_path,
+                prompt=request.prompt,
+            )
+        )
+        if not segmentation_result.candidates:
+            raise ValueError("Image segmentation returned no candidates.")
+
+        candidate = segmentation_result.candidates[0]
+        if candidate.mask_rle is None:
+            raise ValueError(f"Candidate {candidate.candidate_id} has no mask_rle.")
+
+        output_path = request.output_path.expanduser().resolve()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        mask = decode_rle_mask(candidate.mask_rle)
+        rgba = apply_mask_to_alpha(image_path, mask)
+        rgba.save(output_path)
+        if not output_path.is_file():
+            raise FileNotFoundError(f"RGBA image was not written: {output_path}")
+        return output_path
+
+
+def _validate_segment_request(*, image_path: Path, prompt: str) -> None:
+    """Check that the image exists and the prompt is not blank.
+
+    Raises:
+        FileNotFoundError: If ``image_path`` is not an existing file.
+        ValueError: If ``prompt`` is empty or whitespace only.
+    """
+    image_exists = image_path.is_file()
+    if not image_exists:
+        raise FileNotFoundError(f"Image segmentation input not found: {image_path}")
+    has_text = bool(prompt.strip())
+    if has_text:
+        return
+    raise ValueError("Image segmentation prompt must be non-empty.")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/schemas.py
new file mode 100644
index 000000000..d59b7e7a9
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/image_segmentation_manager/schemas.py
@@ -0,0 +1,48 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client import (
+ ImageSegmentationCandidate,
+)
+
+
+@dataclass(frozen=True)
+class AssetImageToRgbaRequest:
+ """Request for converting an asset image to an RGBA cutout."""
+
+ # Source image to cut out.
+ image_path: Path
+ # Text prompt used to segment the image.
+ prompt: str
+ # Path at which the RGBA cutout is requested.
+ output_path: Path
+
+
+@dataclass(frozen=True)
+class ImageSegmentationRequest:
+ """Request for segmenting one image with one text prompt."""
+
+ # Image to segment.
+ image_path: Path
+ # Text prompt describing what to segment.
+ prompt: str
+
+
+@dataclass(frozen=True)
+class ImageSegmentationResult:
+ """Segmentation candidates."""
+
+ # Candidates returned by the segmentation client.
+ # NOTE(review): ordering is decided by the client — confirm whether the
+ # first entry is the best match.
+ candidates: list[ImageSegmentationCandidate]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/__init__.py
new file mode 100644
index 000000000..8d0684d85
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/__init__.py
@@ -0,0 +1,23 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.layout_manager.manager import (
+ LayoutManager,
+)
+
+# Only the manager class imported above is re-exported from this package.
+__all__ = ["LayoutManager"]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/manager.py
new file mode 100644
index 000000000..98bd0ad14
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/manager.py
@@ -0,0 +1,85 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.layout_manager.scene_edit_sa import (
+ optimize_scene_edit_layout_with_sa_node3_5,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.layout_manager.scene_edit_support import (
+ build_xy_footprint,
+ clamp_center_to_support_region,
+ compute_simready_glb_xy_size,
+ support_region_default_center,
+ support_region_grid_center,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.layout_manager.utils import (
+ _layout_text_objects_grid,
+ _object_scenes_xy_aabb_manifest,
+ _optimize_text_layout_slp,
+ _settle_and_pack_object_footprints,
+)
+
+
class LayoutManager:
    """Public API for layout planning and footprint analysis.

    Tools should compose these methods instead of importing private helpers from
    ``layout_manager.utils`` directly. The utils module remains an internal
    implementation detail for shared math and optimization routines.

    Every method is a static, keyword-only pass-through: the keyword arguments
    are forwarded unchanged to the wrapped function and its return value is
    returned as-is. Positional arguments are not accepted.
    """

    @staticmethod
    def layout_text_objects_grid(**kwargs: Any) -> Any:
        """Forward ``kwargs`` to ``utils._layout_text_objects_grid``."""
        return _layout_text_objects_grid(**kwargs)

    @staticmethod
    def object_scenes_xy_aabb_manifest(**kwargs: Any) -> Any:
        """Forward ``kwargs`` to ``utils._object_scenes_xy_aabb_manifest``."""
        return _object_scenes_xy_aabb_manifest(**kwargs)

    @staticmethod
    def optimize_text_layout_slp(**kwargs: Any) -> Any:
        """Forward ``kwargs`` to ``utils._optimize_text_layout_slp``."""
        return _optimize_text_layout_slp(**kwargs)

    @staticmethod
    def settle_and_pack_object_footprints(**kwargs: Any) -> Any:
        """Forward ``kwargs`` to ``utils._settle_and_pack_object_footprints``."""
        return _settle_and_pack_object_footprints(**kwargs)

    @staticmethod
    def optimize_scene_edit_layout_with_sa_node3_5(**kwargs: Any) -> Any:
        """Forward ``kwargs`` to ``scene_edit_sa.optimize_scene_edit_layout_with_sa_node3_5``."""
        # Inside this method body the bare name resolves to the module-level
        # import, not to this static method, so there is no recursion.
        return optimize_scene_edit_layout_with_sa_node3_5(**kwargs)

    @staticmethod
    def compute_simready_glb_xy_size(**kwargs: Any) -> Any:
        """Forward ``kwargs`` to ``scene_edit_support.compute_simready_glb_xy_size``."""
        return compute_simready_glb_xy_size(**kwargs)

    @staticmethod
    def build_xy_footprint(**kwargs: Any) -> Any:
        """Forward ``kwargs`` to ``scene_edit_support.build_xy_footprint``."""
        return build_xy_footprint(**kwargs)

    @staticmethod
    def clamp_center_to_support_region(**kwargs: Any) -> Any:
        """Forward ``kwargs`` to ``scene_edit_support.clamp_center_to_support_region``."""
        return clamp_center_to_support_region(**kwargs)

    @staticmethod
    def support_region_default_center(**kwargs: Any) -> Any:
        """Forward ``kwargs`` to ``scene_edit_support.support_region_default_center``."""
        return support_region_default_center(**kwargs)

    @staticmethod
    def support_region_grid_center(**kwargs: Any) -> Any:
        """Forward ``kwargs`` to ``scene_edit_support.support_region_grid_center``."""
        return support_region_grid_center(**kwargs)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/sa_node3_5.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/sa_node3_5.py
new file mode 100644
index 000000000..b46caec74
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/sa_node3_5.py
@@ -0,0 +1,1490 @@
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.layout_manager.sa_state import (
+ SceneState,
+ Tempo_SceneState,
+)
+import copy
+import json
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, Tuple, Any, Optional, List
+
+import numpy as np
+import trimesh
+from scipy.optimize import minimize
+
+
+def _parse_coordinate_range(
+ coordinate_range: Optional[Dict[str, Any]],
+) -> Optional[Dict[str, Optional[List[float]]]]:
+ """
+ Convert a Node 2 ``coordinate_range`` (expressed in cm) into the optimization
+ frame (expressed in meters).
+
+ Node 2 output shape (cm)::
+
+ {"x": [x_min, x_max], "y": [y_min, y_max]}
+
+ where the whole field may be null, or ``x`` / ``y`` may individually be null.
+
+ Returns (m)::
+
+ {"x": [x_min, x_max] | None, "y": [y_min, y_max] | None}
+
+ or ``None`` when no usable range is present.
+ """
+ if not isinstance(coordinate_range, dict):
+ return None
+
+ def _axis(vals) -> Optional[List[float]]:
+ if not isinstance(vals, (list, tuple)) or len(vals) < 2:
+ return None
+ try:
+ lo = float(vals[0]) * 0.01
+ hi = float(vals[1]) * 0.01
+ except (TypeError, ValueError):
+ return None
+ if lo > hi:
+ lo, hi = hi, lo
+ return [lo, hi]
+
+ x_axis = _axis(coordinate_range.get("x"))
+ y_axis = _axis(coordinate_range.get("y"))
+
+ if x_axis is None and y_axis is None:
+ return None
+ return {"x": x_axis, "y": y_axis}
+
+
+def _intersect_axis_range(
+ a: Optional[List[float]], b: Optional[List[float]]
+) -> Optional[List[float]]:
+ """
+ Intersect two 1D intervals ``[lo, hi]``.
+
+ ``None`` means "no constraint on this axis". When the intersection would be
+ empty (conflicting user input), fall back to the most recently provided
+ interval ``b`` so the constraint stays well-formed instead of infeasible.
+ """
+ if a is None:
+ return b
+ if b is None:
+ return a
+ lo = max(a[0], b[0])
+ hi = min(a[1], b[1])
+ if lo > hi:
+ return b
+ return [lo, hi]
+
+
+def _effective_axis_bounds(
+ group: Dict[str, Any], W: float, H: float
+) -> Tuple[Optional[List[float]], Optional[List[float]]]:
+ """
+ Compute the effective per-axis bounds for a group center, in meters.
+
+ Priority per axis::
+
+ explicit coordinate_range > coarse region_box
+
+ The explicit ``coordinate_range`` extracted by Node 2 overrides the coarse
+ ``region_box`` on a per-axis basis so that conflicting coarse regions never
+ invalidate an exact user constraint. The result is finally clamped to the
+ physical table ``[0, W] x [0, H]``.
+
+ Returns ``(xbounds, ybounds)`` where each is ``[lo, hi]`` or ``None``.
+ """
+ box = group.get("region_box")
+ xb = [float(box[0]), float(box[1])] if box is not None else None
+ yb = [float(box[2]), float(box[3])] if box is not None else None
+
+ cr = group.get("coordinate_range")
+ if isinstance(cr, dict):
+ if cr.get("x") is not None:
+ xb = [float(cr["x"][0]), float(cr["x"][1])]
+ if cr.get("y") is not None:
+ yb = [float(cr["y"][0]), float(cr["y"][1])]
+
+ if xb is not None:
+ xb = [max(0.0, xb[0]), min(float(W), xb[1])]
+ if xb[0] > xb[1]:
+ xb = None
+ if yb is not None:
+ yb = [max(0.0, yb[0]), min(float(H), yb[1])]
+ if yb[0] > yb[1]:
+ yb = None
+ return xb, yb
+
+
+def _is_coordinate_point(value: Any) -> bool:
+ """
+ A relation target may now be either an object-id string (e.g. ``"laptop_0"``)
+ or an absolute coordinate point ``[x, y]`` in cm (e.g. ``[10.0, 20.0]``).
+
+ Return True only for the coordinate-point case.
+ """
+ return (
+ isinstance(value, (list, tuple))
+ and len(value) >= 2
+ and isinstance(value[0], (int, float))
+ and not isinstance(value[0], bool)
+ and isinstance(value[1], (int, float))
+ and not isinstance(value[1], bool)
+ )
+
+
+def _region_box(region: str, W: float, H: float):
+ if region == "left_area":
+ return [0.0, W * 0.33, 0.0, H]
+ elif region == "center_area":
+ return [W * 0.33, W * 0.66, 0.0, H]
+ elif region == "right_area":
+ return [W * 0.66, W, 0.0, H]
+ elif region == "front_area":
+ return [0.0, W, 0.0, H * 0.5]
+ elif region == "back_area":
+ return [0.0, W, H * 0.5, H]
+ elif region == "front_left_area":
+ return [0.0, W * 0.33, 0.0, H * 0.5]
+ elif region == "front_right_area":
+ return [W * 0.66, W, 0.0, H * 0.5]
+ elif region == "back_left_area":
+ return [0.0, W * 0.33, H * 0.5, H]
+ elif region == "back_right_area":
+ return [W * 0.66, W, H * 0.5, H]
+ return None
+
+
+def _region_seed(region: str, W: float, H: float) -> Tuple[float, float]:
+ if region == "left_area":
+ return (W * 0.15, H * 0.5)
+ elif region == "center_area":
+ return (W * 0.5, H * 0.5)
+ elif region == "right_area":
+ return (W * 0.85, H * 0.5)
+ elif region == "front_area":
+ return (W * 0.5, H * 0.2)
+ elif region == "back_area":
+ return (W * 0.5, H * 0.8)
+ elif region == "front_left_area":
+ return (W * 0.15, H * 0.2)
+ elif region == "front_right_area":
+ return (W * 0.85, H * 0.2)
+ elif region == "back_left_area":
+ return (W * 0.15, H * 0.8)
+ elif region == "back_right_area":
+ return (W * 0.85, H * 0.8)
+ return (W * 0.5, H * 0.5)
+
+
def _build_stack_groups_center(
    raw: Dict[str, Dict],
    init_layout: Dict[str, Dict],
    table_size: Tuple[float, float],
):
    """
    Group objects bound by ``Stack_on`` / ``Inside_of`` contacts.

    Each group gets one optimization variable (its root's x/y center).
    Only x/y are optimized.

    ``init_layout[obj_id]["init_coordinate"]`` is expected to be
    ``[x, y, rotation]``; only x/y are used, as the optimization seed.
    ``table_size``, ``coordinate`` and ``init_coordinate`` values are scaled
    by 0.01 here (cm -> m), so all returned positions are in meters.

    Returns:
        ``(groups, object_to_root)`` where ``groups`` maps each root id to a
        dict with keys ``root_id``, ``members`` (sorted by stack depth),
        ``depth_order``, ``region``, ``fixed_xy``, ``fixed_source``,
        ``coordinate_range``, ``init_xy`` and ``region_box``, and
        ``object_to_root`` maps every object id to its group root.
    """
    cm_to_m = 0.01
    W, H = [v * cm_to_m for v in table_size]

    # child -> parent links for binding contacts whose target is a known object.
    parent = {}
    for obj_id, obj in raw.items():
        ct = obj.get("contact", {})
        if isinstance(ct, dict) and ct.get("type") in ("Stack_on", "Inside_of"):
            target = ct.get("target")
            # Only string ids can be keys of ``raw``; this also avoids a
            # TypeError on an unhashable (e.g. list) target.
            if isinstance(target, str) and target and target in raw:
                parent[obj_id] = target

    def find_root(node_id: str) -> str:
        # Walk up the parent chain; ``seen`` stops the walk on a cycle.
        seen = set()
        cur = node_id
        while cur in parent:
            if cur in seen:
                break
            seen.add(cur)
            cur = parent[cur]
        return cur

    def depth_of(node_id: str) -> int:
        # Number of parent hops before the chain ends or revisits a node.
        depth = 0
        seen = {node_id}
        cur = node_id
        while cur in parent:
            nxt = parent[cur]
            if nxt in seen:
                break
            seen.add(nxt)
            cur = nxt
            depth += 1
        return depth

    root_to_members = defaultdict(list)
    object_to_root = {}

    for obj_id in raw:
        root = find_root(obj_id)
        root_to_members[root].append(obj_id)
        object_to_root[obj_id] = root

    groups = {}
    for root_id, members in root_to_members.items():
        members_sorted = sorted(members, key=depth_of)

        root_obj = raw.get(root_id)
        region = (
            root_obj.get("region", "unspecified")
            if root_obj is not None
            else "unspecified"
        )

        # The shallowest member with an explicit coordinate pins the group.
        fixed_xy = None
        fixed_source = None
        for oid in members_sorted:
            coord = raw[oid].get("coordinate", None)
            if coord is not None:
                fixed_xy = (float(coord[0]) * cm_to_m, float(coord[1]) * cm_to_m)
                fixed_source = oid
                break

        # Intersect the coordinate ranges declared by all members.
        coordinate_range = None
        for oid in members_sorted:
            cr = _parse_coordinate_range(raw[oid].get("coordinate_range", None))
            if cr is None:
                continue
            if coordinate_range is None:
                coordinate_range = {"x": cr["x"], "y": cr["y"]}
            else:
                coordinate_range["x"] = _intersect_axis_range(
                    coordinate_range["x"], cr["x"]
                )
                coordinate_range["y"] = _intersect_axis_range(
                    coordinate_range["y"], cr["y"]
                )

        # Seed from the root's init coordinate, else from the first member
        # that has one. Both paths convert cm -> m; the member fallback
        # previously skipped the conversion and produced a 100x too large seed.
        init_xy = None
        for oid in (root_id, *members_sorted):
            if oid in init_layout and "init_coordinate" in init_layout[oid]:
                coord = init_layout[oid]["init_coordinate"]
                if isinstance(coord, (list, tuple)) and len(coord) >= 2:
                    init_xy = (
                        float(coord[0]) * cm_to_m,
                        float(coord[1]) * cm_to_m,
                    )
                break

        if init_xy is None:
            init_xy = _region_seed(region, W, H)

        region_box = _region_box(region, W, H)

        if fixed_xy is not None:
            init_xy = fixed_xy
        else:
            # Pull the seed inside the effective bounds so the solver starts
            # from a point that satisfies the box constraints.
            xb, yb = _effective_axis_bounds(
                {"region_box": region_box, "coordinate_range": coordinate_range},
                W,
                H,
            )
            sx, sy = float(init_xy[0]), float(init_xy[1])
            if xb is not None:
                sx = float(np.clip(sx, xb[0], xb[1]))
            if yb is not None:
                sy = float(np.clip(sy, yb[0], yb[1]))
            init_xy = (sx, sy)

        groups[root_id] = {
            "root_id": root_id,
            "members": members_sorted,
            "depth_order": {oid: depth_of(oid) for oid in members_sorted},
            "region": region,
            "fixed_xy": fixed_xy,
            "fixed_source": fixed_source,
            "coordinate_range": coordinate_range,
            "init_xy": [float(init_xy[0]), float(init_xy[1])],
            "region_box": region_box,
        }

    return groups, object_to_root
+
+
+def _load_objects_cfg_by_uid(ec_root: str | Path) -> Dict[str, Dict[str, Any]]:
+ ec_root = Path(ec_root)
+ cfg_path = ec_root / "objects_config_scaled.json"
+ if not cfg_path.exists():
+ raise FileNotFoundError(f"objects_config_scaled.json not found: {cfg_path}")
+
+ data = json.loads(cfg_path.read_text(encoding="utf-8"))
+ rigid = data.get("rigid_object", [])
+ out = {}
+ for obj in rigid:
+ uid = obj.get("uid")
+ if uid:
+ out[uid] = obj
+ return out
+
+
def rotate_mesh_to_optimization_frame(mesh):
    """
    Re-express ``mesh.vertices`` in the layout frame, in place.

    Matches ``gym_export._sim_world_xy_aabb``: sim/layout XY is GLB X and
    -GLB Z. Each vertex ``(x, y, z)`` becomes ``(x, -z, y)``. The same mesh
    object is returned.
    """
    glb_to_layout = np.array(
        [
            [1, 0, 0],
            [0, 0, -1],
            [0, 1, 0],
        ],
        dtype=float,
    )
    mesh.vertices = mesh.vertices @ glb_to_layout.T
    return mesh
+
+
+def _load_mesh_simple(mesh_path: str | Path):
+ mesh_path = Path(mesh_path)
+ if not mesh_path.exists():
+ print(f"[WARN] Mesh file not found: {mesh_path}")
+ return None
+
+ try:
+ mesh = trimesh.load(mesh_path, force="mesh")
+ except Exception as e:
+ print(f"[WARN] Failed to load mesh {mesh_path}: {e}")
+ return None
+
+ if isinstance(mesh, trimesh.Scene):
+ geoms = []
+ for geom in mesh.geometry.values():
+ if geom is not None and geom.faces is not None and len(geom.faces) > 0:
+ geoms.append(geom.copy())
+ if not geoms:
+ print(f"[WARN] Scene has no geometry: {mesh_path}")
+ return None
+ mesh = trimesh.util.concatenate(geoms)
+
+ return mesh
+
+
+def _apply_body_scale(mesh: trimesh.Trimesh, body_scale):
+ if body_scale is None:
+ return mesh
+ try:
+ scale = np.asarray(body_scale, dtype=np.float64).reshape(-1)
+ if scale.size == 3 and not np.allclose(scale, [1.0, 1.0, 1.0]):
+ mesh = mesh.copy()
+ # body_scale is stored in sim axes [X, Y, Z]. Trimesh vertices are
+ # still in GLB axes here, so convert back to [GLB_X, GLB_Y, GLB_Z].
+ mesh.apply_scale([scale[0], scale[2], scale[1]])
+ except Exception:
+ pass
+ return mesh
+
+
+def _get_cfg_init_rot(cfg: Dict[str, Any]) -> np.ndarray:
+ value = cfg.get("init_rot")
+ if not isinstance(value, (list, tuple)) or len(value) < 3:
+ return np.zeros(3, dtype=np.float64)
+ try:
+ rot = np.asarray(value[:3], dtype=np.float64)
+ except (TypeError, ValueError):
+ return np.zeros(3, dtype=np.float64)
+ if rot.shape != (3,) or not np.all(np.isfinite(rot)):
+ return np.zeros(3, dtype=np.float64)
+ return rot
+
+
def _apply_cfg_rotation(mesh: trimesh.Trimesh, cfg: Dict[str, Any]) -> trimesh.Trimesh:
    """
    Return a copy of ``mesh`` rotated by the config's ``init_rot``.

    The three angles from ``_get_cfg_init_rot`` are converted from degrees to
    radians and passed to ``euler_matrix`` with ``axes="sxyz"``. When every
    angle is within ``1e-8`` of zero the input mesh is returned as-is (no copy).
    """
    angles_deg = _get_cfg_init_rot(cfg)
    if np.all(np.abs(angles_deg) <= 1.0e-8):
        return mesh

    import trimesh.transformations as tt

    angle_x, angle_y, angle_z = (float(np.deg2rad(angle)) for angle in angles_deg)
    rotated = mesh.copy()
    rotated.apply_transform(tt.euler_matrix(angle_x, angle_y, angle_z, axes="sxyz"))
    return rotated
+
+
def _prepare_collision_mesh(
    mesh: trimesh.Trimesh,
    cfg: Dict[str, Any],
) -> trimesh.Trimesh:
    """
    Bring a freshly loaded mesh into the form used for collision checks.

    Steps, in order: apply ``cfg["body_scale"]`` (identity when absent),
    convert the vertices to the layout frame, then apply the config's
    initial rotation.
    """
    body_scale = cfg.get("body_scale", [1.0, 1.0, 1.0])
    scaled = _apply_body_scale(mesh, body_scale)
    in_layout_frame = rotate_mesh_to_optimization_frame(scaled)
    return _apply_cfg_rotation(in_layout_frame, cfg)
+
+
+def _get_init_rot_deg(state, uid: str) -> float:
+ init_layout = getattr(state, "init_layout", {})
+ if isinstance(init_layout, dict) and uid in init_layout:
+ coord = init_layout[uid].get("init_coordinate", None)
+ if isinstance(coord, (list, tuple)) and len(coord) >= 3:
+ try:
+ return float(coord[2])
+ except Exception:
+ pass
+ return 0.0
+
+
+def _pose_from_center_xy(mesh: trimesh.Trimesh, x: float, y: float, z: float = 0.0):
+ bounds = np.asarray(mesh.bounds, dtype=np.float64)
+ center_xy = 0.5 * (bounds[0, :2] + bounds[1, :2])
+ min_z = float(bounds[0, 2])
+ pose = np.eye(4, dtype=np.float64)
+ pose[:3, 3] = [
+ float(x) - float(center_xy[0]),
+ float(y) - float(center_xy[1]),
+ float(z) - min_z,
+ ]
+ return pose
+
+
+def _world_bounds(mesh: trimesh.Trimesh, pose: np.ndarray):
+ m = mesh.copy()
+ m.apply_transform(pose)
+ bmin, bmax = m.bounds
+ bmin = np.asarray(bmin, dtype=float).ravel()
+ bmax = np.asarray(bmax, dtype=float).ravel()
+ return bmin, bmax
+
+
+def _render_collision_mesh_topdown(
+ *,
+ output_path: Path,
+ mesh_dict: Dict[str, trimesh.Trimesh],
+ pose_dict: Dict[str, np.ndarray],
+ table_size: Tuple[float, float],
+ title: str,
+) -> None:
+ if not mesh_dict:
+ return
+ try:
+ import matplotlib
+
+ matplotlib.use("Agg", force=True)
+ import matplotlib.pyplot as plt
+ from matplotlib.collections import PolyCollection
+ except Exception as exc:
+ print(f"[WARN] Failed to import matplotlib for collision render: {exc}")
+ return
+
+ W, H = [float(v) * 0.01 for v in table_size]
+ fig, ax = plt.subplots(figsize=(8, 8), dpi=180)
+ colors = plt.cm.tab20(np.linspace(0.0, 1.0, max(len(mesh_dict), 1)))
+
+ all_xy: list[np.ndarray] = []
+ for index, uid in enumerate(sorted(mesh_dict)):
+ mesh = mesh_dict[uid].copy()
+ pose = pose_dict.get(uid)
+ if pose is not None:
+ mesh.apply_transform(pose)
+ vertices = np.asarray(mesh.vertices, dtype=np.float64)
+ faces = np.asarray(mesh.faces, dtype=np.int64)
+ if vertices.size == 0 or faces.size == 0:
+ continue
+
+ xy = vertices[:, :2]
+ all_xy.append(xy)
+ polygons = xy[faces]
+ color = colors[index % len(colors)]
+ collection = PolyCollection(
+ polygons,
+ facecolors=[color],
+ edgecolors=[(0.0, 0.0, 0.0, 0.18)],
+ linewidths=0.15,
+ alpha=0.34,
+ )
+ ax.add_collection(collection)
+
+ bounds = mesh.bounds
+ center_xy = 0.5 * (bounds[0, :2] + bounds[1, :2])
+ ax.text(
+ float(center_xy[0]),
+ float(center_xy[1]),
+ uid,
+ fontsize=6,
+ ha="center",
+ va="center",
+ color="black",
+ bbox={"facecolor": "white", "alpha": 0.65, "edgecolor": "none", "pad": 1.0},
+ )
+
+ ax.add_patch(
+ plt.Rectangle(
+ (0.0, 0.0),
+ W,
+ H,
+ fill=False,
+ edgecolor="black",
+ linewidth=1.2,
+ linestyle="--",
+ )
+ )
+ ax.set_title(title)
+ ax.set_xlabel("SA/layout X (m)")
+ ax.set_ylabel("SA/layout Y (m)")
+ ax.set_aspect("equal", adjustable="box")
+ ax.grid(True, linestyle=":", linewidth=0.5, alpha=0.5)
+
+ if all_xy:
+ stacked = np.vstack(all_xy)
+ min_xy = np.minimum(stacked.min(axis=0), np.array([0.0, 0.0]))
+ max_xy = np.maximum(stacked.max(axis=0), np.array([W, H]))
+ else:
+ min_xy = np.array([0.0, 0.0])
+ max_xy = np.array([W, H])
+ span = np.maximum(max_xy - min_xy, 1.0e-3)
+ pad = np.maximum(span * 0.08, 0.02)
+ ax.set_xlim(float(min_xy[0] - pad[0]), float(max_xy[0] + pad[0]))
+ ax.set_ylim(float(min_xy[1] - pad[1]), float(max_xy[1] + pad[1]))
+
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ fig.tight_layout()
+ fig.savefig(output_path, facecolor="white")
+ plt.close(fig)
+
+
+def _pair_separation_from_bounds(bmin_a, bmax_a, bmin_b, bmax_b, margin: float = 0.02):
+ bmin_a = np.asarray(bmin_a, dtype=float)
+ bmax_a = np.asarray(bmax_a, dtype=float)
+ bmin_b = np.asarray(bmin_b, dtype=float)
+ bmax_b = np.asarray(bmax_b, dtype=float)
+
+ overlap_x = min(bmax_a[0], bmax_b[0]) - max(bmin_a[0], bmin_b[0])
+ overlap_y = min(bmax_a[1], bmax_b[1]) - max(bmin_a[1], bmin_b[1])
+ overlap_z = min(bmax_a[2], bmax_b[2]) - max(bmin_a[2], bmin_b[2])
+
+ if overlap_x <= 0 or overlap_y <= 0 or overlap_z <= 0:
+ return None, None, None
+
+ center_a = 0.5 * (bmin_a + bmax_a)
+ center_b = 0.5 * (bmin_b + bmax_b)
+
+ if overlap_x <= overlap_y:
+ sign = 1.0 if center_a[0] >= center_b[0] else -1.0
+ direction_2d = np.array([sign, 0.0], dtype=float)
+ required_sep = float(overlap_x + margin)
+ severity = float(overlap_x)
+ else:
+ sign = 1.0 if center_a[1] >= center_b[1] else -1.0
+ direction_2d = np.array([0.0, sign], dtype=float)
+ required_sep = float(overlap_y + margin)
+ severity = float(overlap_y)
+
+ return direction_2d, required_sep, severity
+
+
def _clamp_group_center(
    gid: str, xy: np.ndarray, groups: Dict[str, Dict], table_size: Tuple[float, float]
) -> np.ndarray:
    """
    Project a group center back into its allowed area.

    A group with ``fixed_xy`` always returns that fixed position. Otherwise
    the point is clipped to the table (``table_size`` scaled by 0.01) and
    then to the group's effective per-axis bounds, where present.
    """
    group = groups[gid]
    pinned = group.get("fixed_xy")
    if pinned is not None:
        return np.asarray(pinned, dtype=float)

    W, H = [v * 0.01 for v in table_size]
    clamped_x = float(np.clip(float(xy[0]), 0.0, W))
    clamped_y = float(np.clip(float(xy[1]), 0.0, H))

    x_bounds, y_bounds = _effective_axis_bounds(group, W, H)
    if x_bounds is not None:
        clamped_x = float(np.clip(clamped_x, x_bounds[0], x_bounds[1]))
    if y_bounds is not None:
        clamped_y = float(np.clip(clamped_y, y_bounds[0], y_bounds[1]))

    return np.array([clamped_x, clamped_y], dtype=float)
+
+
+def _print_collision_item(item: Dict[str, Any]):
+ print(
+ f"[COLLISION] {item['a']}({item['group_a']}) <-> {item['b']}({item['group_b']}) | "
+ f"dir={item['direction_2d'].tolist()} | required_sep={item['required_sep']:.4f} | "
+ f"severity={item['severity']:.4f} | "
+ f"overlaps: x={item['overlap_x']:.4f}, y={item['overlap_y']:.4f}, z={item['overlap_z']:.4f}"
+ )
+
+
def _greedy_push_apart(
    current_centers: Dict[str, np.ndarray],
    collisions: List[Dict[str, Any]],
    groups: Dict[str, Dict],
    table_size: Tuple[float, float],
    push_scale: float = 0.05,
):
    """
    Move colliding groups apart by one greedy step per collision.

    ``item["direction_2d"]`` points from object B's side toward object A's
    side (the same convention ``_add_pair_separation_constraint`` encodes as
    ``d . (a - b) >= sep``). Separating therefore means moving group A along
    ``+direction`` and/or group B along ``-direction``:

    * both groups free: each moves half a step in its own direction,
    * one group has ``fixed_xy``: the other takes the full step away from it,
    * both fixed: the collision is skipped.

    The previous one-side-fixed branches used the opposite signs and pushed
    the movable group toward the fixed one.

    All centers are clamped with ``_clamp_group_center`` afterwards. The
    input mapping is not modified; a new dict of ``(2,)`` arrays is returned.
    """
    new_centers = {
        gid: np.asarray(xy, dtype=float).reshape(2).copy()
        for gid, xy in current_centers.items()
    }

    for item in collisions:
        ga = item["group_a"]
        gb = item["group_b"]

        dir2d = np.asarray(item["direction_2d"], dtype=float).reshape(2)
        norm = float(np.linalg.norm(dir2d))
        if norm < 1e-8:
            continue
        dir2d = dir2d / norm

        # NOTE(review): centers are in meters, so this 0.5 floor always
        # dominates ``required_sep * push_scale`` for realistic overlaps and
        # yields a 0.5 m step - confirm the floor is intended.
        step = max(0.5, float(item["required_sep"]) * push_scale)

        a_fixed = groups[ga].get("fixed_xy") is not None
        b_fixed = groups[gb].get("fixed_xy") is not None

        if a_fixed and b_fixed:
            continue
        if a_fixed:
            # Only B can move: away from A is -direction.
            new_centers[gb] -= dir2d * step
        elif b_fixed:
            # Only A can move: away from B is +direction.
            new_centers[ga] += dir2d * step
        else:
            new_centers[ga] += dir2d * (step * 0.5)
            new_centers[gb] -= dir2d * (step * 0.5)

    for gid in list(new_centers):
        new_centers[gid] = _clamp_group_center(
            gid, new_centers[gid], groups, table_size
        )

    return new_centers
+
+
+def _local_z_interval(mesh: trimesh.Trimesh, rot_deg: float):
+ m = mesh.copy()
+ center = np.asarray(m.bounds.mean(axis=0), dtype=np.float64)
+
+ theta = np.deg2rad(float(rot_deg))
+ c = float(np.cos(theta))
+ s = float(np.sin(theta))
+
+ t1 = np.eye(4, dtype=np.float64)
+ t1[:3, 3] = -center
+
+ rz = np.array(
+ [
+ [c, -s, 0.0, 0.0],
+ [s, c, 0.0, 0.0],
+ [0.0, 0.0, 1.0, 0.0],
+ [0.0, 0.0, 0.0, 1.0],
+ ],
+ dtype=np.float64,
+ )
+
+ m.apply_transform(rz @ t1)
+ bmin, bmax = m.bounds
+ return float(bmin[2]), float(bmax[2])
+
+
def _refine_stack_group_z(
    state: SceneState,
    ec_root: str | Path,
    optimized_layout: Dict[str, Dict],
    groups: Dict[str, Dict],
    z_gap: float = 0.01,
):
    """
    Assign stacked Z values to the members of every group.

    Members are processed in ``group["members"]`` order. The first member
    that has a loadable mesh is placed at the group's base height (the ``z``
    of ``members[0]`` in the layout, default ``0.0``); each following member
    is placed so that its lowest point sits ``z_gap`` above the previous
    member's highest point. Members without a config entry or mesh are
    skipped and do not affect the stack.

    Previously the base height was used only for index 0, so a root without
    a mesh made the next member compute ``None - float`` and raise
    ``TypeError``.

    Returns a deep copy of ``optimized_layout`` with ``z`` set and, where
    ``center_2d`` exists, ``center_3d`` set to ``[x, y, z]``.

    Raises:
        FileNotFoundError: propagated from ``_load_objects_cfg_by_uid`` when
            the objects config is missing.
    """
    cfg_by_uid = _load_objects_cfg_by_uid(ec_root)

    final_layout = copy.deepcopy(optimized_layout)

    # NOTE(review): unlike ``_prepare_collision_mesh`` the config ``init_rot``
    # is not applied here, so heights are measured on the un-tilted mesh -
    # confirm this is intended.
    mesh_cache = {}
    for uid in final_layout:
        cfg = cfg_by_uid.get(uid)
        if cfg is None:
            continue

        fpath = cfg.get("shape", {}).get("fpath")
        if not fpath:
            continue

        mesh = _load_mesh_simple(Path(ec_root) / fpath)
        if mesh is None:
            continue

        mesh = _apply_body_scale(mesh, cfg.get("body_scale", [1.0, 1.0, 1.0]))
        mesh_cache[uid] = rotate_mesh_to_optimization_frame(mesh)

    for group in groups.values():
        members = group["members"]
        if not members:
            continue

        root_id = members[0]
        root_z = 0.0
        if root_id in final_layout and "z" in final_layout[root_id]:
            try:
                root_z = float(final_layout[root_id]["z"])
            except (TypeError, ValueError, OverflowError):
                # Unusable stored z: keep the 0.0 default.
                root_z = 0.0

        # Top of the most recently placed member; None until one is placed.
        prev_top = None
        for uid in members:
            if uid not in mesh_cache:
                continue

            rot_deg = _get_init_rot_deg(state, uid)
            local_min_z, local_max_z = _local_z_interval(
                mesh_cache[uid], rot_deg=rot_deg
            )

            if prev_top is None:
                z_center = float(root_z)
            else:
                z_center = float(prev_top - local_min_z + z_gap)

            prev_top = z_center + local_max_z

            entry = final_layout.setdefault(uid, {})
            entry["z"] = float(z_center)
            if "center_2d" in entry:
                entry["center_3d"] = [
                    float(entry["center_2d"][0]),
                    float(entry["center_2d"][1]),
                    float(z_center),
                ]

    return final_layout
+
+
+def _solve_group_model(
+ model: Dict[str, Any], seed_centers: Optional[Dict[str, np.ndarray]] = None
+):
+ group_ids = model["group_ids"]
+ group_index = model["group_index"]
+ groups = model["groups"]
+
+ if seed_centers is None:
+ seed_centers = {
+ gid: np.asarray(groups[gid]["init_xy"], dtype=float) for gid in group_ids
+ }
+
+ x0 = []
+ for gid in group_ids:
+ xy = np.asarray(
+ seed_centers.get(gid, groups[gid]["init_xy"]), dtype=float
+ ).reshape(2)
+ x0.extend([float(xy[0]), float(xy[1])])
+ x0 = np.asarray(x0, dtype=float)
+
+ def unpack(xvec):
+ out = {}
+ for gid, idx in group_index.items():
+ out[gid] = np.array([xvec[2 * idx], xvec[2 * idx + 1]], dtype=float)
+ return out
+
+ def objective(xvec):
+ coords = unpack(xvec)
+ loss = 0.0
+
+ for gid in group_ids:
+ init_xy = np.asarray(
+ seed_centers.get(gid, groups[gid]["init_xy"]), dtype=float
+ )
+ loss += 5 * float(np.sum((coords[gid] - init_xy) ** 2))
+
+ min_dist = 0.01
+ for i in range(len(group_ids)):
+ for j in range(i + 1, len(group_ids)):
+ a = group_ids[i]
+ b = group_ids[j]
+ d = float(np.linalg.norm(coords[a] - coords[b]))
+ overlap = max(0.0, min_dist - d)
+ loss += 0.05 * (overlap**2)
+
+ return loss
+
+ constraints = []
+
+ for row, b in zip(model["A_ub"], model["b_ub"]):
+ row = np.asarray(row, dtype=float)
+ b = float(b)
+ constraints.append(
+ {
+ "type": "ineq",
+ "fun": lambda xvec, row=row, b=b: b - float(np.dot(row, xvec)),
+ }
+ )
+
+ for row, b in zip(model["A_eq"], model["b_eq"]):
+ row = np.asarray(row, dtype=float)
+ b = float(b)
+ constraints.append(
+ {
+ "type": "eq",
+ "fun": lambda xvec, row=row, b=b: float(np.dot(row, xvec)) - b,
+ }
+ )
+
+ result = minimize(
+ objective,
+ x0,
+ method="SLSQP",
+ constraints=constraints,
+ options={"maxiter": 500, "ftol": 1e-6, "disp": False},
+ )
+
+ solved = unpack(result.x)
+ return result, solved
+
+
def _build_collision_scene(
    state: SceneState | Tempo_SceneState,
    ec_root: str | Path,
    group_centers: Dict[str, np.ndarray],
    object_to_group: Dict[str, str],
):
    """
    Load, prepare and pose every configured object for collision checking.

    For each config entry whose group has a center in ``group_centers`` and
    whose ``shape.fpath`` mesh loads, the mesh is prepared with
    ``_prepare_collision_mesh`` and posed so its bounding box is centered on
    the group center and rests on ``z = 0``. ``state`` is not used.

    Returns:
        ``(cm, mesh_dict, pose_dict)``. ``cm`` is a trimesh
        ``CollisionManager`` holding the posed meshes, or ``None`` when one
        cannot be created. A mesh that the manager refuses is still present
        in ``mesh_dict`` / ``pose_dict``.
    """
    cfg_by_uid = _load_objects_cfg_by_uid(ec_root)

    try:
        manager = trimesh.collision.CollisionManager()
    except Exception:
        manager = None

    meshes = {}
    poses = {}

    for uid, cfg in cfg_by_uid.items():
        gid = object_to_group.get(uid)
        if gid not in group_centers:
            continue

        rel_path = cfg.get("shape", {}).get("fpath")
        if not rel_path:
            continue

        loaded = _load_mesh_simple(Path(ec_root) / rel_path)
        if loaded is None:
            continue

        prepared = _prepare_collision_mesh(loaded, cfg)
        anchor = np.asarray(group_centers[gid], dtype=float).reshape(2)
        pose = _pose_from_center_xy(prepared, float(anchor[0]), float(anchor[1]))

        meshes[uid] = prepared
        poses[uid] = pose

        if manager is None:
            continue
        try:
            manager.add_object(uid, prepared, transform=pose)
        except Exception:
            pass

    return manager, meshes, poses
+
+
def _detect_collision_pairs(
    cm,
    mesh_dict: Dict[str, trimesh.Trimesh],
    pose_dict: Dict[str, np.ndarray],
    object_to_group: Dict[str, str],
):
    """
    List colliding object pairs with a push direction for each.

    Candidate pairs come from the collision manager's internal pairwise
    query when ``cm`` is available. If ``cm`` is ``None`` or the query fails
    (a ``[WARN]`` line is printed), every pair of objects in ``mesh_dict`` is
    considered instead. A candidate becomes a result only if the two
    world-space bounding boxes overlap on X, Y and Z.

    The earlier implementation called ``cm.in_collision_other`` with a mesh
    and a ``transform`` keyword. That method compares two managers and has no
    such keyword, so the call raised, the exception was swallowed, and no
    collision was ever reported when a manager existed.

    Returns:
        A list of dicts with keys ``a``, ``b``, ``group_a``, ``group_b``,
        ``direction_2d``, ``required_sep``, ``severity``, ``overlap_x``,
        ``overlap_y`` and ``overlap_z``, sorted by ``severity`` descending.
        ``a`` is always the object that appears first in ``mesh_dict``.
    """
    ids = list(mesh_dict.keys())
    order = {uid: pos for pos, uid in enumerate(ids)}

    bounds_cache: Dict[str, Any] = {}

    def bounds_of(uid):
        # World bounds need a mesh copy + transform, so compute once per object.
        if uid not in bounds_cache:
            bounds_cache[uid] = _world_bounds(mesh_dict[uid], pose_dict[uid])
        return bounds_cache[uid]

    pairs = None
    if cm is not None:
        try:
            _, names = cm.in_collision_internal(return_names=True)
            unique = set()
            for first, second in names or ():
                if first == second or first not in order or second not in order:
                    continue
                if order[first] > order[second]:
                    first, second = second, first
                unique.add((first, second))
            pairs = sorted(unique, key=lambda pair: (order[pair[0]], order[pair[1]]))
        except Exception as exc:
            print(
                f"[WARN] Collision manager query failed, using AABB scan instead: {exc}"
            )
            pairs = None

    if pairs is None:
        pairs = [
            (ids[i], ids[j]) for i in range(len(ids)) for j in range(i + 1, len(ids))
        ]

    results = []
    for a, b in pairs:
        bmin_a, bmax_a = bounds_of(a)
        bmin_b, bmax_b = bounds_of(b)

        dir2d, required_sep, severity = _pair_separation_from_bounds(
            bmin_a, bmax_a, bmin_b, bmax_b
        )
        if dir2d is None:
            continue

        overlap_x = min(bmax_a[0], bmax_b[0]) - max(bmin_a[0], bmin_b[0])
        overlap_y = min(bmax_a[1], bmax_b[1]) - max(bmin_a[1], bmin_b[1])
        overlap_z = min(bmax_a[2], bmax_b[2]) - max(bmin_a[2], bmin_b[2])

        results.append(
            {
                "a": a,
                "b": b,
                "group_a": object_to_group.get(a, a),
                "group_b": object_to_group.get(b, b),
                "direction_2d": dir2d,
                "required_sep": required_sep,
                "severity": severity,
                "overlap_x": float(overlap_x),
                "overlap_y": float(overlap_y),
                "overlap_z": float(overlap_z),
            }
        )

    results.sort(key=lambda item: float(item["severity"]), reverse=True)
    return results
+
+
+def _add_pair_separation_constraint(
+ model: Dict[str, Any],
+ ga: str,
+ gb: str,
+ direction_2d: np.ndarray,
+ required_sep: float,
+):
+ group_index = model["group_index"]
+ if ga not in group_index or gb not in group_index:
+ return
+
+ ia = group_index[ga]
+ ib = group_index[gb]
+
+ dx, dy = float(direction_2d[0]), float(direction_2d[1])
+ row = np.zeros(2 * len(model["group_ids"]), dtype=float)
+ row[2 * ia] = -dx
+ row[2 * ia + 1] = -dy
+ row[2 * ib] = dx
+ row[2 * ib + 1] = dy
+
+ model["A_ub"].append(row.tolist())
+ model["b_ub"].append(float(-required_sep))
+
+
def _build_work_model(base_model: Dict[str, Any], collision_terms: List[Dict[str, Any]]):
    """
    Return a deep copy of ``base_model`` extended with collision constraints.

    Each term must provide ``ga``, ``gb``, ``direction_2d`` and ``margin``;
    ``margin`` is used as the required separation of the added constraint.
    ``base_model`` itself is left unchanged.
    """
    work_model = copy.deepcopy(base_model)
    for term in collision_terms:
        group_a = term["ga"]
        group_b = term["gb"]
        push_dir = np.asarray(term["direction_2d"], dtype=float)
        separation = float(term["margin"])
        _add_pair_separation_constraint(
            work_model, group_a, group_b, push_dir, separation
        )
    return work_model
+
+
def _group_half_extents_xy(
    *,
    ec_root: str | Path,
    groups: Dict[str, Dict],
    object_to_group: Dict[str, str],
) -> Dict[str, np.ndarray]:
    """
    Compute, per group, the largest XY half-size among its members' meshes.

    Every group starts at ``[0, 0]``. For each configured object that belongs
    to a known group and whose mesh loads, the half-size of the prepared
    mesh's XY bounding box is merged in with an element-wise maximum.
    Objects without a mesh path, with an unloadable mesh, or with bounds that
    are not ``(2, 3)`` are skipped.
    """
    cfg_by_uid = _load_objects_cfg_by_uid(ec_root)
    half_sizes: Dict[str, np.ndarray] = {
        gid: np.zeros(2, dtype=np.float64) for gid in groups
    }

    for uid, cfg in cfg_by_uid.items():
        gid = object_to_group.get(uid)
        if gid not in half_sizes:
            continue

        rel_path = cfg.get("shape", {}).get("fpath")
        if not rel_path:
            continue

        loaded = _load_mesh_simple(Path(ec_root) / rel_path)
        if loaded is None:
            continue

        prepared = _prepare_collision_mesh(loaded, cfg)
        bounds = np.asarray(prepared.bounds, dtype=np.float64)
        if bounds.shape != (2, 3):
            continue

        size_xy = bounds[1, :2] - bounds[0, :2]
        member_half = 0.5 * np.maximum(size_xy, 0.0)
        half_sizes[gid] = np.maximum(half_sizes[gid], member_half)

    return half_sizes
+
+
+def run_node_3_5(state: Tempo_SceneState, ec_root: str | Path) -> Tempo_SceneState:
+    """Solve 2D group-center placement on the table, then refine away collisions.
+
+    The function mutates and returns ``state``. Visible stages:
+
+    1. Return early (appending a "skipped" message) when ``raw_object_dict``,
+       ``table_size`` or ``init_layout`` is missing or empty.
+    2. Drop objects whose id starts with ``"table_"`` and build stack groups
+       via ``_build_stack_groups_center``; one (x, y) variable pair is created
+       per group.
+    3. Assemble inequality rows (``A_ub``/``b_ub``) and equality rows
+       (``A_eq``/``b_eq``) for table bounds, fixed positions, per-axis bounds
+       and ``left_of``/``right_of``/``front_of``/``back_of`` relations.
+    4. Solve the base model with ``_solve_group_model``, then run up to 8
+       refinement rounds that add one separation term per newly colliding
+       group pair and re-solve, falling back to ``_greedy_push_apart`` when no
+       new term can be added or the solver reports failure.
+    5. Run up to 5 extra greedy "safety" pushes, build ``optimized_layout``,
+       refine it with ``_refine_stack_group_z`` and store the results on
+       ``state``.
+
+    Two top-down debug images, ``collision_mesh_before.png`` and
+    ``collision_mesh_after.png``, are requested under ``ec_root``.
+
+    Args:
+        state: Scene state providing ``raw_object_dict``, ``table_size`` and
+            ``init_layout``; results are written back onto it.
+        ec_root: Root directory handed to the mesh/collision helpers and used
+            as the output directory for the debug renders.
+
+    Returns:
+        The same ``state`` object, with ``stack_groups``,
+        ``optimization_model``, ``optimized_layout``,
+        ``optimized_group_centers`` and ``messages`` updated.
+    """
+    print(">>> Node 3.5: compiling center-point model and optimizing...")
+
+    raw = state.raw_object_dict
+    if not raw or state.table_size is None or not state.init_layout:
+        state.messages.append("Node 3.5 skipped: missing inputs")
+        return state
+
+    # Entries whose id starts with "table_" are excluded from the optimization.
+    object_items = {
+        obj_id: obj for obj_id, obj in raw.items() if not obj_id.startswith("table_")
+    }
+
+    # Table extents scaled by 0.01 -- presumably cm -> m, matching the
+    # "(cm->m)" wording in the model notes below; confirm against the caller.
+    W, H = [v * 0.01 for v in state.table_size]
+
+    groups, object_to_group = _build_stack_groups_center(
+        object_items, state.init_layout, state.table_size
+    )
+    state.stack_groups = groups
+    group_half_extents = _group_half_extents_xy(
+        ec_root=ec_root,
+        groups=groups,
+        object_to_group=object_to_group,
+    )
+
+    # Group k owns variables 2*k (first coordinate) and 2*k + 1 (second).
+    group_ids = list(groups.keys())
+    group_index = {gid: i for i, gid in enumerate(group_ids)}
+
+    # An object that belongs to no group is treated as its own group id.
+    def gid_of(obj_id: str) -> str:
+        return object_to_group.get(obj_id, obj_id)
+
+    A_ub, b_ub = [], []
+    A_eq, b_eq = [], []
+    relation_terms = []
+    # Gap used for relations against absolute points (same unit as W and H).
+    clearance = 0.03
+
+    # Append one dense inequality row built from sparse {variable index: coeff}.
+    # NOTE(review): rows are presumably read as ``A_ub @ v <= b_ub`` by
+    # _solve_group_model -- confirm there.
+    def add_ub(coeffs: Dict[int, float], rhs: float):
+        row = np.zeros(2 * len(group_ids), dtype=float)
+        for idx, val in coeffs.items():
+            row[idx] = float(val)
+        A_ub.append(row.tolist())
+        b_ub.append(float(rhs))
+
+    # Same as add_ub, but for the equality system ``A_eq`` / ``b_eq``.
+    def add_eq(coeffs: Dict[int, float], rhs: float):
+        row = np.zeros(2 * len(group_ids), dtype=float)
+        for idx, val in coeffs.items():
+            row[idx] = float(val)
+        A_eq.append(row.tolist())
+        b_eq.append(float(rhs))
+
+    for gid in group_ids:
+        i = group_index[gid]
+
+        # Keep every group center inside the table rectangle [0, W] x [0, H].
+        add_ub({2 * i: -1.0}, 0.0)
+        add_ub({2 * i: 1.0}, W)
+        add_ub({2 * i + 1: -1.0}, 0.0)
+        add_ub({2 * i + 1: 1.0}, H)
+
+        # Fixed groups are pinned by equality rows and get no per-axis bounds.
+        if groups[gid]["fixed_xy"] is not None:
+            fx, fy = groups[gid]["fixed_xy"]
+            add_eq({2 * i: 1.0}, fx)
+            add_eq({2 * i + 1: 1.0}, fy)
+            continue
+
+        # Each of xb / yb is either None or an indexable (low, high) pair.
+        xb, yb = _effective_axis_bounds(groups[gid], W, H)
+        if xb is not None:
+            add_ub({2 * i: -1.0}, -xb[0])
+            add_ub({2 * i: 1.0}, xb[1])
+        if yb is not None:
+            add_ub({2 * i + 1: -1.0}, -yb[0])
+            add_ub({2 * i + 1: 1.0}, yb[1])
+
+    # Translate one relation list of a source object into inequality rows and
+    # record a descriptive entry in relation_terms for each handled target.
+    def add_relation_constraints(
+        src_i: int,
+        src_gid: str,
+        src_id: str,
+        targets,
+        rel_type: str,
+    ):
+        for tgt in targets or []:
+            if _is_coordinate_point(tgt):
+                # Absolute point target: scaled by 0.01 like the table size and
+                # turned into a single-variable bound offset by `clearance`.
+                px = float(tgt[0]) * 0.01
+                py = float(tgt[1]) * 0.01
+                if rel_type == "left_of":
+                    add_ub({2 * src_i: 1.0}, px - clearance)
+                elif rel_type == "right_of":
+                    add_ub({2 * src_i: -1.0}, -(px + clearance))
+                elif rel_type == "front_of":
+                    add_ub({2 * src_i + 1: 1.0}, py - clearance)
+                elif rel_type == "back_of":
+                    add_ub({2 * src_i + 1: -1.0}, -(py + clearance))
+                relation_terms.append(
+                    {
+                        "source": src_id,
+                        "target_point": [px, py],
+                        "type": rel_type,
+                        "source_group": src_gid,
+                        "target_group": None,
+                        "gap": clearance,
+                    }
+                )
+                continue
+
+            # Object target: relations inside one group, or against an unknown
+            # group, are skipped without recording a term.
+            tgt_id = tgt
+            tgt_gid = gid_of(tgt_id)
+            if tgt_gid == src_gid or tgt_gid not in group_index:
+                continue
+            tgt_j = group_index[tgt_gid]
+            src_half = group_half_extents.get(src_gid, np.zeros(2, dtype=np.float64))
+            tgt_half = group_half_extents.get(tgt_gid, np.zeros(2, dtype=np.float64))
+            # The required center distance is the sum of both groups'
+            # half-extents on the relevant axis; `clearance` is not added here.
+            if rel_type == "left_of":
+                gap = float(src_half[0] + tgt_half[0])
+                add_ub({2 * src_i: 1.0, 2 * tgt_j: -1.0}, -gap)
+            elif rel_type == "right_of":
+                gap = float(src_half[0] + tgt_half[0])
+                add_ub({2 * tgt_j: 1.0, 2 * src_i: -1.0}, -gap)
+            elif rel_type == "front_of":
+                gap = float(src_half[1] + tgt_half[1])
+                add_ub({2 * src_i + 1: 1.0, 2 * tgt_j + 1: -1.0}, -gap)
+            elif rel_type == "back_of":
+                gap = float(src_half[1] + tgt_half[1])
+                add_ub({2 * tgt_j + 1: 1.0, 2 * src_i + 1: -1.0}, -gap)
+            else:
+                # Any other relation type adds no row; only the term is logged.
+                gap = clearance
+            relation_terms.append(
+                {
+                    "source": src_id,
+                    "target": tgt_id,
+                    "type": rel_type,
+                    "source_group": src_gid,
+                    "target_group": tgt_gid,
+                    "gap": gap,
+                }
+            )
+
+    for src_id, obj in object_items.items():
+        src_gid = gid_of(src_id)
+        if src_gid not in group_index:
+            continue
+        src_i = group_index[src_gid]
+
+        # A missing, falsy or non-dict "relation" entry is treated as empty.
+        rel = obj.get("relation", {}) or {}
+        if not isinstance(rel, dict):
+            rel = {}
+
+        add_relation_constraints(
+            src_i, src_gid, src_id, rel.get("left_of", []), "left_of"
+        )
+        add_relation_constraints(
+            src_i, src_gid, src_id, rel.get("right_of", []), "right_of"
+        )
+        add_relation_constraints(
+            src_i, src_gid, src_id, rel.get("front_of", []), "front_of"
+        )
+        add_relation_constraints(
+            src_i, src_gid, src_id, rel.get("back_of", []), "back_of"
+        )
+
+        # "towards_to" adds no constraint rows; it is only recorded, flagged
+        # as ignored in this stage.
+        if rel.get("towards_to"):
+            relation_terms.append(
+                {
+                    "source": src_id,
+                    "type": "towards_to",
+                    "targets": rel.get("towards_to", []),
+                    "ignored_in_stage": True,
+                }
+            )
+
+    base_model = {
+        "variable_type": "group_root_center_2d",
+        "group_ids": group_ids,
+        "variable_order": group_ids,
+        "group_index": group_index,
+        "object_to_group": object_to_group,
+        "groups": groups,
+        "A_ub": A_ub,
+        "b_ub": b_ub,
+        "A_eq": A_eq,
+        "b_eq": b_eq,
+        "relation_terms": relation_terms,
+        "table_bounds": {"x_range": [0.0, W], "y_range": [0.0, H]},
+        "collision_terms": [],
+        "notes": [
+            "Center-point optimization only.",
+            "Stack_on and Inside_of are bound by group roots.",
+            "rotation is not part of optimization variables.",
+            "rotation is only used when building collision geometry.",
+            "collision constraints are added only to temporary working models.",
+            "coordinate_range (cm->m) is added as per-axis hard bounds and "
+            "overrides the coarse region box.",
+            "relation targets may be object ids or absolute points [x, y] (cm); "
+            "points become single-variable bounds.",
+        ],
+    }
+
+    state.optimization_model = base_model
+
+    # First solve: relations and bounds only, no collision terms yet. A failed
+    # solve is reported but its result is still used as the starting layout.
+    result, solved = _solve_group_model(base_model, seed_centers=None)
+    if not result.success:
+        state.messages.append(
+            f"Node 3.5 warning: optimizer did not fully converge: {result.message}"
+        )
+        print(f"[WARN] optimizer did not fully converge: {result.message}")
+
+    current_centers = {
+        gid: np.asarray(solved[gid], dtype=float).reshape(2) for gid in group_ids
+    }
+
+    max_rounds = 8
+    max_added_pairs = 64
+    collision_margin = 0.02
+
+    # Each unordered group pair contributes at most one collision term across
+    # all rounds; the history is re-applied in full on every re-solve.
+    seen_pair_keys = set()
+    collision_terms_history: List[Dict[str, Any]] = []
+    debug_renders: Dict[str, str] = {}
+
+    # cm_initial is not used afterwards; only the meshes and poses feed the
+    # "before" render.
+    cm_initial, mesh_dict_initial, pose_dict_initial = _build_collision_scene(
+        state,
+        ec_root,
+        current_centers,
+        base_model["object_to_group"],
+    )
+    before_render_path = Path(ec_root) / "collision_mesh_before.png"
+    _render_collision_mesh_topdown(
+        output_path=before_render_path,
+        mesh_dict=mesh_dict_initial,
+        pose_dict=pose_dict_initial,
+        table_size=state.table_size,
+        title="SA collision mesh before collision refinement",
+    )
+    # The render path is only recorded when the file actually exists.
+    if before_render_path.is_file():
+        debug_renders["collision_mesh_before"] = str(before_render_path)
+
+    for round_idx in range(max_rounds):
+        print(f"\n>>> Collision refinement round {round_idx + 1}")
+
+        cm, mesh_dict, pose_dict = _build_collision_scene(
+            state,
+            ec_root,
+            current_centers,
+            base_model["object_to_group"],
+        )
+        collisions = _detect_collision_pairs(
+            cm, mesh_dict, pose_dict, base_model["object_to_group"]
+        )
+
+        if not collisions:
+            state.messages.append(
+                f"Node 3.5 collision refinement finished at round {round_idx}: no collisions"
+            )
+            break
+
+        added_this_round = 0
+
+        # Only the first max_added_pairs reported collisions are considered.
+        for item in collisions[:max_added_pairs]:
+            ga = item["group_a"]
+            gb = item["group_b"]
+            if ga == gb:
+                continue
+
+            pair_key = tuple(sorted((ga, gb)))
+            if pair_key in seen_pair_keys:
+                continue
+
+            # A (near-)zero direction cannot define a separating constraint.
+            dir2d = np.asarray(item["direction_2d"], dtype=float).reshape(2)
+            if np.linalg.norm(dir2d) < 1e-8:
+                continue
+
+            required_sep = float(item["required_sep"]) + float(collision_margin)
+
+            collision_terms_history.append(
+                {
+                    "ga": ga,
+                    "gb": gb,
+                    "direction_2d": dir2d.tolist(),
+                    "margin": required_sep,
+                }
+            )
+            seen_pair_keys.add(pair_key)
+            added_this_round += 1
+
+        # Collisions remain but every pair was already constrained (or was
+        # unusable): nudge centers directly with a small push and retry.
+        if added_this_round == 0:
+            print(
+                "[COLLISION] no new constraints from current round, using greedy fallback"
+            )
+            current_centers = _greedy_push_apart(
+                current_centers,
+                collisions,
+                groups,
+                state.table_size,
+                push_scale=0.05,
+            )
+            continue
+
+        work_model = _build_work_model(base_model, collision_terms_history)
+
+        result, solved = _solve_group_model(work_model, seed_centers=current_centers)
+        if not result.success:
+            msg = f"Node 3.5 collision refinement warning round {round_idx}: optimizer failed: {result.message}"
+            state.messages.append(msg)
+            print(f"[WARN] {msg}")
+
+            # On solver failure the solved centers are discarded and a larger
+            # greedy push (0.5 instead of 0.05) is applied.
+            current_centers = _greedy_push_apart(
+                current_centers,
+                collisions,
+                groups,
+                state.table_size,
+                push_scale=0.5,
+            )
+            continue
+
+        new_centers = {
+            gid: np.asarray(solved[gid], dtype=float).reshape(2)
+            for gid in base_model["group_ids"]
+        }
+
+        # Largest displacement of any group center in this round.
+        max_move = 0.0
+        for gid in base_model["group_ids"]:
+            delta = np.linalg.norm(new_centers[gid] - current_centers[gid])
+            max_move = max(max_move, float(delta))
+
+        current_centers = new_centers
+        state.messages.append(
+            f"Node 3.5 collision refinement round {round_idx + 1}: "
+            f"constraints={len(collision_terms_history)}, max_move={max_move:.4f}"
+        )
+        print(
+            f"[SOLVE] done | constraints={len(collision_terms_history)} | max_move={max_move:.4f}"
+        )
+
+        # Re-check collisions at the freshly solved centers.
+        cm2, mesh_dict2, pose_dict2 = _build_collision_scene(
+            state,
+            ec_root,
+            current_centers,
+            base_model["object_to_group"],
+        )
+        post_collisions = _detect_collision_pairs(
+            cm2, mesh_dict2, pose_dict2, base_model["object_to_group"]
+        )
+
+        if not post_collisions:
+            state.messages.append(
+                f"Node 3.5 collision refinement converged at round {round_idx + 1}"
+            )
+            print("[COLLISION] fully resolved")
+            break
+        else:
+            print(
+                f"[COLLISION] still unresolved after solve: {len(post_collisions)} pairs"
+            )
+            for item in post_collisions:
+                _print_collision_item(item)
+
+        # Stop refining once a successful solve barely moves anything.
+        if max_move < 1e-4:
+            break
+
+    # Safety net: up to 5 small greedy pushes while collisions are detected.
+    for safety_round in range(5):
+        cm_s, mesh_dict_s, pose_dict_s = _build_collision_scene(
+            state,
+            ec_root,
+            current_centers,
+            base_model["object_to_group"],
+        )
+        remaining = _detect_collision_pairs(
+            cm_s, mesh_dict_s, pose_dict_s, base_model["object_to_group"]
+        )
+        if not remaining:
+            break
+        print(
+            f"[COLLISION] safety push round {safety_round + 1}, remaining={len(remaining)}"
+        )
+        current_centers = _greedy_push_apart(
+            current_centers,
+            remaining,
+            groups,
+            state.table_size,
+            push_scale=0.05,
+        )
+
+    # Every member of a group shares the group's center; stack_level is the
+    # member's position in the group's "members" list.
+    optimized_layout = {}
+    for gid in group_ids:
+        root_xy = current_centers[gid]
+        members = groups[gid]["members"]
+
+        for stack_level, obj_id in enumerate(members):
+            optimized_layout[obj_id] = {
+                "group_root": gid,
+                "stack_level": stack_level,
+                "is_fixed": groups[gid]["fixed_xy"] is not None,
+                "center_2d": [float(root_xy[0]), float(root_xy[1])],
+                "region": groups[gid]["region"],
+                "rotation_deg": float(_get_init_rot_deg(state, obj_id)),
+                "contact": object_items[obj_id].get("contact", {}),
+                "relation": object_items[obj_id].get("relation", {}),
+            }
+
+    optimized_layout = _refine_stack_group_z(
+        state=state,
+        ec_root=ec_root,
+        optimized_layout=optimized_layout,
+        groups=groups,
+        z_gap=0.01,
+    )
+
+    # base_model is the same dict assigned to the state earlier; the collision
+    # terms and render paths are attached to it in place.
+    state.optimization_model = base_model
+    state.optimization_model["collision_terms"] = collision_terms_history
+    state.optimization_model["debug_renders"] = debug_renders
+    state.optimized_layout = optimized_layout
+    # NOTE(review): the Tempo_SceneState dataclass visible in sa_state.py
+    # declares no `optimized_group_centers` field, so this creates an ad-hoc
+    # attribute -- confirm that is intended.
+    state.optimized_group_centers = {
+        gid: [float(v[0]), float(v[1])] for gid, v in current_centers.items()
+    }
+
+    # Final scene: used for the "after" render and the closing collision report.
+    cm_f, mesh_dict_f, pose_dict_f = _build_collision_scene(
+        state,
+        ec_root,
+        current_centers,
+        base_model["object_to_group"],
+    )
+    after_render_path = Path(ec_root) / "collision_mesh_after.png"
+    _render_collision_mesh_topdown(
+        output_path=after_render_path,
+        mesh_dict=mesh_dict_f,
+        pose_dict=pose_dict_f,
+        table_size=state.table_size,
+        title="SA collision mesh after collision refinement",
+    )
+    if after_render_path.is_file():
+        debug_renders["collision_mesh_after"] = str(after_render_path)
+    state.optimization_model["debug_renders"] = debug_renders
+
+    final_collisions = _detect_collision_pairs(
+        cm_f, mesh_dict_f, pose_dict_f, base_model["object_to_group"]
+    )
+    if final_collisions:
+        state.messages.append(
+            f"Node 3.5 finished with remaining collisions: {len(final_collisions)}"
+        )
+        print(f"[COLLISION] final remaining collisions: {len(final_collisions)}")
+        for item in final_collisions:
+            _print_collision_item(item)
+    else:
+        state.messages.append("Node 3.5 completed with no collisions")
+        print("[COLLISION] final state clean")
+
+    state.messages.append(
+        f"Node 3.5 completed: optimized {len(optimized_layout)} objects"
+    )
+    return state
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/sa_state.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/sa_state.py
new file mode 100644
index 000000000..a91e23e16
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/sa_state.py
@@ -0,0 +1,34 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+
+
+@dataclass
+class SceneState:
+    """Plain dataclass bundling scene inputs, layout data and log messages.
+
+    Every field has an empty default (``table_size`` defaults to ``None``), so
+    an instance can be created without arguments and filled in afterwards.
+    """
+
+    # Unparsed input payload.
+    raw_input_json: Dict = field(default_factory=dict)
+    # Two-element table size; None until it is known.
+    table_size: Optional[Tuple[float, float]] = None
+    table_semantics: str = ""
+    coordinate_system: Dict = field(default_factory=dict)
+    # Per-object dictionaries keyed by object id.
+    raw_object_dict: Dict[str, Dict] = field(default_factory=dict)
+    init_layout: Dict[str, Dict] = field(default_factory=dict)
+    optimization_model: Dict[str, Any] = field(default_factory=dict)
+    optimized_layout: Dict[str, Dict[str, Any]] = field(default_factory=dict)
+    stack_groups: Dict[str, Dict[str, Any]] = field(default_factory=dict)
+    asset_specs: Dict[str, Dict[str, Any]] = field(default_factory=dict)
+    # Free-form status / warning lines.
+    messages: List[str] = field(default_factory=list)
+
+
+@dataclass
+class Tempo_SceneState:
+    """Scene state variant consumed and updated by ``run_node_3_5``.
+
+    Compared with ``SceneState`` it adds ``filtered_objects_info``,
+    ``filtered_objects`` and ``final_layout`` and has no ``asset_specs``.
+    Every field has an empty default (``table_size`` defaults to ``None``).
+    """
+
+    raw_input_json: Dict = field(default_factory=dict)
+    # (width, height) of the table; the scene-edit caller stores the support
+    # region size multiplied by 100 and run_node_3_5 scales it by 0.01.
+    table_size: Optional[Tuple[float, float]] = None
+    table_semantics: str = ""
+    coordinate_system: Dict = field(default_factory=dict)
+    # Object id -> dict; the scene-edit caller fills "region", "contact",
+    # "relation" and, for fixed objects, "coordinate".
+    raw_object_dict: Dict[str, Dict] = field(default_factory=dict)
+    filtered_objects_info: Dict[str, Dict] = field(default_factory=dict)
+    filtered_objects: Dict[str, Dict] = field(default_factory=dict)
+    # Object id -> {"init_coordinate": [x, y, rot_deg]} as built by the
+    # scene-edit caller.
+    init_layout: Dict[str, Dict] = field(default_factory=dict)
+    # Linear model (constraint rows, relation terms, notes) set by run_node_3_5.
+    optimization_model: Dict[str, Any] = field(default_factory=dict)
+    # Object id -> placement dict (includes "center_2d") set by run_node_3_5.
+    optimized_layout: Dict[str, Dict[str, Any]] = field(default_factory=dict)
+    # Group id -> group dict returned by the stack-group builder.
+    stack_groups: Dict[str, Dict[str, Any]] = field(default_factory=dict)
+    final_layout: Dict[str, Dict[str, Any]] = field(default_factory=dict)
+    # Status and warning lines appended by the processing nodes.
+    messages: List[str] = field(default_factory=list)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/scene_edit_sa.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/scene_edit_sa.py
new file mode 100644
index 000000000..0c6e565ea
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/scene_edit_sa.py
@@ -0,0 +1,247 @@
+from __future__ import annotations
+
+import copy
+import os
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.layout_manager.sa_node3_5 import (
+ run_node_3_5,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.layout_manager.sa_state import (
+ Tempo_SceneState,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+from embodichain.gen_sim.prompt2scene.workflows.gym_export import _glb_scale_to_sim
+
+# Only the optimization entry point is exported; the helpers are private.
+__all__ = ["optimize_scene_edit_layout_with_sa_node3_5"]
+
+
+def _grid_to_region(grid_name: str) -> str:
+    """Translate a grid-cell name into its region label.
+
+    The name is stringified and stripped of surrounding whitespace first; a
+    falsy name is treated as the empty string.
+
+    Returns:
+        The matching ``*_area`` label, or ``"unspecified"`` for unknown names.
+    """
+    key = str(grid_name).strip() if grid_name else ""
+    regions_by_grid = dict(
+        left_front="front_left_area",
+        center_front="front_area",
+        right_front="front_right_area",
+        left_center="left_area",
+        center="center_area",
+        right_center="right_area",
+        left_back="back_left_area",
+        center_back="back_area",
+        right_back="back_right_area",
+        front="front_area",
+        back="back_area",
+        left="left_area",
+        right="right_area",
+    )
+    if key in regions_by_grid:
+        return regions_by_grid[key]
+    return "unspecified"
+
+
+def _support_region_origin_xy(
+    support_region: dict[str, Any],
+) -> tuple[float, float]:
+    """Return the minimum-corner XY of a support region.
+
+    ``aabb_xy`` (a two-item list of two-item lists) wins when it is present
+    and well-formed: its first item is returned. Otherwise the corner is
+    derived as ``center_xy - size_xy / 2`` when both are two-item lists.
+    ``(0.0, 0.0)`` is returned when neither description is usable.
+    """
+
+    def _is_pair(candidate: Any) -> bool:
+        return isinstance(candidate, list) and len(candidate) == 2
+
+    aabb = support_region.get("aabb_xy")
+    if _is_pair(aabb) and all(_is_pair(corner) for corner in aabb):
+        min_corner = aabb[0]
+        return float(min_corner[0]), float(min_corner[1])
+
+    center = support_region.get("center_xy")
+    extent = support_region.get("size_xy")
+    if not (_is_pair(center) and _is_pair(extent)):
+        return 0.0, 0.0
+    corner_x = float(center[0]) - 0.5 * float(extent[0])
+    corner_y = float(center[1]) - 0.5 * float(extent[1])
+    return corner_x, corner_y
+
+
+def _resolve_asset_path(value: Any, *, output_root: Path) -> Path:
+    """Turn ``value`` into a resolved path, anchoring relative paths at ``output_root``.
+
+    ``value`` is stringified (falsy values become ``""``) and ``~`` is expanded
+    before the absolute/relative decision is made.
+    """
+    candidate = Path(str(value or "")).expanduser()
+    anchored = candidate if candidate.is_absolute() else output_root / candidate
+    return anchored.resolve()
+
+
+def _generated_body_scale(generated_asset: dict[str, Any]) -> list[float]:
+    """Return the per-axis simulation body scale for a generated asset.
+
+    The uniform factor is read from
+    ``generated_asset["metric_scale"]["scale_factor"]`` and converted with
+    ``_glb_scale_to_sim``. The factor falls back to ``1.0`` when
+    ``metric_scale`` is not a dict, when the value cannot be converted to
+    ``float``, or when the result is not a finite, strictly positive number
+    (NaN and infinity are rejected as well as zero and negatives).
+
+    Args:
+        generated_asset: Asset record that may carry a ``metric_scale`` dict.
+
+    Returns:
+        The values returned by ``_glb_scale_to_sim`` for the uniform factor,
+        each converted to ``float``.
+    """
+    scale_factor = 1.0
+    metric_scale = generated_asset.get("metric_scale")
+    if isinstance(metric_scale, dict):
+        try:
+            scale_factor = float(metric_scale.get("scale_factor", 1.0))
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError covers integers too large to represent as float.
+            scale_factor = 1.0
+    # ``0 < x < inf`` is False for NaN, +/-inf, zero and negatives alike, so a
+    # single chained comparison rejects every unusable factor.
+    if not 0.0 < scale_factor < float("inf"):
+        scale_factor = 1.0
+    body_scale = _glb_scale_to_sim([scale_factor, scale_factor, scale_factor])
+    return [float(value) for value in body_scale]
+
+
+def _build_objects_config_scaled(
+    *,
+    output_root: Path,
+    runtime_root: Path,
+    layout_items: dict[str, dict[str, Any]],
+    rigid_by_id: dict[str, dict[str, Any]],
+    generated_asset_by_id: dict[str, dict[str, Any]],
+) -> dict[str, Any]:
+    """Assemble the ``rigid_object`` config list for every object in the layout.
+
+    Objects are visited in sorted id order. An object already present in
+    ``rigid_by_id`` is deep-copied and its mesh path, taken relative to
+    ``<output_root>/gym_export``, is rewritten relative to ``runtime_root``.
+    Any other object must have an entry in ``generated_asset_by_id``; a fresh
+    config is built from its ``simready_geometry_path`` (or ``mesh_path``).
+
+    Returns:
+        ``{"rigid_object": [...]}`` with one entry per layout object.
+
+    Raises:
+        ValueError: An existing rigid object has no dict ``shape``, or an
+            object is in neither ``rigid_by_id`` nor ``generated_asset_by_id``.
+        FileNotFoundError: A resolved mesh path is not an existing file.
+    """
+    entries: list[dict[str, Any]] = []
+    export_dir = output_root / "gym_export"
+
+    for object_id in sorted(layout_items):
+        if object_id not in rigid_by_id:
+            # Newly generated object: build its config from scratch.
+            asset = generated_asset_by_id.get(object_id)
+            if asset is None:
+                raise ValueError(f"Missing generated asset for {object_id}")
+            source_path = asset.get("simready_geometry_path") or asset.get("mesh_path")
+            generated_mesh = _resolve_asset_path(source_path, output_root=output_root)
+            if not generated_mesh.is_file():
+                raise FileNotFoundError(
+                    f"Generated mesh not found for {object_id}: {generated_mesh}"
+                )
+            entry = {
+                "uid": object_id,
+                "shape": {
+                    "shape_type": "Mesh",
+                    "fpath": os.path.relpath(generated_mesh, runtime_root),
+                    "compute_uv": False,
+                },
+                "body_scale": _generated_body_scale(asset),
+                "init_pos": [0.0, 0.0, 0.0],
+                "init_rot": [0.0, 0.0, 0.0],
+            }
+            entries.append(entry)
+            continue
+
+        # Existing object: copy its config and only re-anchor the mesh path.
+        existing = copy.deepcopy(rigid_by_id[object_id])
+        shape_cfg = existing.get("shape")
+        if not isinstance(shape_cfg, dict):
+            raise ValueError(f"Existing rigid_object has no shape for {object_id}")
+        existing_mesh = _resolve_asset_path(
+            export_dir / str(shape_cfg.get("fpath", "")),
+            output_root=output_root,
+        )
+        if not existing_mesh.is_file():
+            raise FileNotFoundError(f"Mesh not found for {object_id}: {existing_mesh}")
+        shape_cfg["fpath"] = os.path.relpath(existing_mesh, runtime_root)
+        existing["shape"] = shape_cfg
+        entries.append(existing)
+
+    return {"rigid_object": entries}
+
+
+def optimize_scene_edit_layout_with_sa_node3_5(
+    *,
+    output_root: Path,
+    support_region: dict[str, Any],
+    layout_items: dict[str, dict[str, Any]],
+    updated_relations: list[dict[str, Any]],
+    updated_grids: dict[str, str],
+    fixed_object_ids: list[str],
+    rigid_by_id: dict[str, dict[str, Any]],
+    generated_asset_by_id: dict[str, dict[str, Any]],
+    runtime_root: Path,
+) -> dict[str, Any]:
+    """Run ``run_node_3_5`` on a scene-edit layout and return optimized centers.
+
+    The function writes ``objects_config_scaled.json`` into ``runtime_root``,
+    builds a ``Tempo_SceneState`` from the layout, and hands it to
+    ``run_node_3_5``. Object centers are shifted by the support-region origin
+    and multiplied by 100 on the way in (``run_node_3_5`` scales the table
+    size and point targets by 0.01); on the way out only the origin is added
+    back to each ``center_2d``.
+
+    Args:
+        output_root: Root used to resolve mesh paths.
+        support_region: Must contain a two-item list ``size_xy``; ``aabb_xy``
+            or ``center_xy`` determine the origin.
+        layout_items: Object id -> item; items without a two-item list
+            ``center_xy`` are left out of the optimization state.
+        updated_relations: Dicts with ``subject``, ``relation`` and ``object``;
+            ``left_of``, ``right_of``, ``front_of``, ``back_of`` and ``behind``
+            (stored as ``back_of``) are honoured, others are ignored.
+        updated_grids: Object id -> grid-cell name, mapped to a region label.
+        fixed_object_ids: Ids whose current position is passed as a fixed
+            ``coordinate``.
+        rigid_by_id: Existing rigid-object configs; ``init_rot[2]`` supplies
+            the initial rotation value.
+        generated_asset_by_id: Asset records for objects without a rigid config.
+        runtime_root: Working directory; created if missing.
+
+    Returns:
+        ``{"status": "ok", "centers": {...}, "metadata": {...}}`` where
+        ``metadata`` carries the state's messages, optimization model and
+        stack groups.
+
+    Raises:
+        ValueError: ``support_region["size_xy"]`` is not a two-item list.
+    """
+    region_size = support_region.get("size_xy")
+    if not isinstance(region_size, list) or len(region_size) != 2:
+        raise ValueError("support_region.size_xy is required for SA scene-edit optimization.")
+    origin_x, origin_y = _support_region_origin_xy(support_region)
+
+    runtime_root.mkdir(parents=True, exist_ok=True)
+    scaled_cfg = _build_objects_config_scaled(
+        output_root=output_root,
+        runtime_root=runtime_root,
+        layout_items=layout_items,
+        rigid_by_id=rigid_by_id,
+        generated_asset_by_id=generated_asset_by_id,
+    )
+    write_json(runtime_root / "objects_config_scaled.json", scaled_cfg)
+
+    def _to_optimizer_xy(center: list[Any]) -> list[float]:
+        # Shift into the support-region frame, then scale by 100.
+        return [
+            (float(center[0]) - origin_x) * 100.0,
+            (float(center[1]) - origin_y) * 100.0,
+        ]
+
+    def _initial_rot_deg(object_id: str) -> float:
+        # Third component of the stored init_rot, or 0.0 when unavailable.
+        rotation = [0.0, 0.0, 0.0]
+        rigid = rigid_by_id.get(object_id)
+        if isinstance(rigid, dict):
+            stored = rigid.get("init_rot")
+            if isinstance(stored, list) and len(stored) >= 3:
+                rotation = stored
+        try:
+            return float(rotation[2])
+        except (TypeError, ValueError, IndexError):
+            return 0.0
+
+    scene = Tempo_SceneState()
+    scene.table_size = (float(region_size[0]) * 100.0, float(region_size[1]) * 100.0)
+    scene.raw_object_dict = {}
+    scene.init_layout = {}
+
+    for object_id, item in sorted(layout_items.items()):
+        center_xy = item.get("center_xy")
+        if not isinstance(center_xy, list) or len(center_xy) != 2:
+            continue
+        entry: dict[str, Any] = {
+            "region": _grid_to_region(updated_grids.get(object_id, "")),
+            "contact": {},
+            "relation": {
+                name: [] for name in ("left_of", "right_of", "front_of", "back_of")
+            },
+        }
+        scene.raw_object_dict[object_id] = entry
+        if object_id in fixed_object_ids:
+            entry["coordinate"] = _to_optimizer_xy(center_xy)
+        scene.init_layout[object_id] = {
+            "init_coordinate": [
+                *_to_optimizer_xy(center_xy),
+                _initial_rot_deg(object_id),
+            ]
+        }
+
+    # Accepted relation names -> the bucket they are stored under.
+    relation_buckets = {
+        "left_of": "left_of",
+        "right_of": "right_of",
+        "front_of": "front_of",
+        "back_of": "back_of",
+        "behind": "back_of",
+    }
+    for rel in updated_relations:
+        subject = str(rel.get("subject", "")).strip()
+        relation_name = str(rel.get("relation", "")).strip()
+        target_id = str(rel.get("object", "")).strip()
+        known = scene.raw_object_dict
+        if subject not in known or target_id not in known:
+            continue
+        bucket = relation_buckets.get(relation_name)
+        if bucket is not None:
+            known[subject]["relation"][bucket].append(target_id)
+
+    scene = run_node_3_5(scene, runtime_root)
+
+    centers: dict[str, list[float]] = {}
+    for object_id, item in (scene.optimized_layout or {}).items():
+        local_center = item.get("center_2d")
+        if isinstance(local_center, list) and len(local_center) == 2:
+            centers[object_id] = [
+                float(local_center[0]) + origin_x,
+                float(local_center[1]) + origin_y,
+            ]
+
+    metadata = {
+        "messages": list(getattr(scene, "messages", []) or []),
+        "optimization_model": getattr(scene, "optimization_model", {}),
+        "stack_groups": getattr(scene, "stack_groups", {}),
+    }
+    return {"status": "ok", "centers": centers, "metadata": metadata}
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/scene_edit_support.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/scene_edit_support.py
new file mode 100644
index 000000000..f696554a8
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/scene_edit_support.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+
+def compute_simready_glb_xy_size(
+    glb_path: Path,
+    *,
+    metric_scale: dict[str, Any] | None = None,
+) -> list[float]:
+    """Measure the scaled two-axis footprint of a GLB file.
+
+    The file is loaded as a trimesh scene and flattened into one mesh. The
+    footprint uses bounding-box axes 0 and 2 (the second returned value is the
+    extent along mesh axis 2, not axis 1), multiplied by
+    ``metric_scale["scale_factor"]`` and floored at ``1e-4``. The factor falls
+    back to ``1.0`` when it is missing, unparsable, non-finite or not positive.
+
+    Args:
+        glb_path: Path of the GLB file to load.
+        metric_scale: Optional dict carrying ``scale_factor``.
+
+    Returns:
+        ``[size_axis0, size_axis2]`` after scaling.
+
+    Raises:
+        RuntimeError: trimesh is not installed.
+        ValueError: The flattened mesh bounds are not a ``(2, 3)`` array.
+    """
+    try:
+        import trimesh
+    except ImportError as exc:
+        raise RuntimeError("Scene edit layout requires trimesh.") from exc
+
+    loaded = trimesh.load(glb_path, force="scene")
+    if isinstance(loaded, trimesh.Trimesh):
+        mesh = loaded
+    else:
+        flattened = loaded.dump(concatenate=True)
+        if isinstance(flattened, trimesh.Trimesh):
+            mesh = flattened
+        else:
+            # dump() returned a collection: keep only the triangle meshes.
+            parts = [part for part in flattened if isinstance(part, trimesh.Trimesh)]
+            mesh = trimesh.util.concatenate(parts)
+
+    aabb = np.asarray(mesh.bounds, dtype=np.float64)
+    if aabb.shape != (2, 3):
+        raise ValueError(f"Invalid GLB bounds shape: {aabb.shape}")
+    width = float(aabb[1, 0] - aabb[0, 0])
+    depth = float(aabb[1, 2] - aabb[0, 2])
+
+    factor = 1.0
+    if isinstance(metric_scale, dict):
+        try:
+            factor = float(metric_scale.get("scale_factor", 1.0))
+        except (TypeError, ValueError):
+            factor = 1.0
+    if not (np.isfinite(factor) and factor > 0.0):
+        factor = 1.0
+    return [max(width * factor, 1.0e-4), max(depth * factor, 1.0e-4)]
+
+
+def build_xy_footprint(
+    *,
+    center_xy: list[float],
+    size_xy: list[float],
+) -> dict[str, Any]:
+    """Describe an axis-aligned XY rectangle by center, corners and size.
+
+    Negative sizes are clamped to zero. The result carries ``"unit": "m"``,
+    the float ``center_xy``, the ``aabb_xy`` corner pair
+    ``[[min_x, min_y], [max_x, max_y]]`` and the clamped ``size_xy``.
+    """
+    center = [float(center_xy[0]), float(center_xy[1])]
+    size = [max(float(size_xy[0]), 0.0), max(float(size_xy[1]), 0.0)]
+    half = [0.5 * side for side in size]
+    lower = [mid - reach for mid, reach in zip(center, half)]
+    upper = [mid + reach for mid, reach in zip(center, half)]
+    return {
+        "unit": "m",
+        "center_xy": center,
+        "aabb_xy": [lower, upper],
+        "size_xy": size,
+    }
+
+
+def clamp_center_to_support_region(
+    *,
+    center_xy: list[float],
+    size_xy: list[float],
+    support_region: dict[str, Any],
+) -> list[float]:
+    """Move a footprint center so the footprint stays inside the support region.
+
+    Each axis is handled independently: the center is clamped to
+    ``[min + size / 2, max - size / 2]``. When the footprint is larger than the
+    region on an axis, the center is placed at the region's midpoint on that
+    axis. Without a well-formed ``aabb_xy`` (two-item list of two-item lists)
+    the center is returned unchanged, converted to floats.
+
+    Returns:
+        The adjusted center as a list of floats.
+    """
+    region_box = support_region.get("aabb_xy")
+    box_is_valid = (
+        isinstance(region_box, list)
+        and len(region_box) == 2
+        and all(isinstance(corner, list) and len(corner) == 2 for corner in region_box)
+    )
+    if not box_is_valid:
+        return [float(center_xy[0]), float(center_xy[1])]
+
+    region_min = np.asarray(region_box[0], dtype=np.float64)
+    region_max = np.asarray(region_box[1], dtype=np.float64)
+    half_size = 0.5 * np.asarray(size_xy, dtype=np.float64)
+    requested = np.asarray(center_xy, dtype=np.float64)
+    lowest = region_min + half_size
+    highest = region_max - half_size
+
+    result = requested.copy()
+    for axis in (0, 1):
+        fits = lowest[axis] <= highest[axis]
+        if not fits:
+            # Footprint does not fit on this axis: use the region midpoint.
+            result[axis] = float(0.5 * (region_min[axis] + region_max[axis]))
+            continue
+        result[axis] = min(max(requested[axis], lowest[axis]), highest[axis])
+    return result.tolist()
+
+
+def support_region_default_center(
+    *,
+    support_region: dict[str, Any],
+) -> np.ndarray:
+    """Pick a fallback center for a support region.
+
+    Preference order: the explicit ``center_xy`` pair, then the midpoint of a
+    well-formed ``aabb_xy`` box, then the origin ``[0, 0]``. Only two-item
+    lists are accepted as pairs.
+
+    Returns:
+        A length-2 float64 array.
+    """
+
+    def _is_pair(value: Any) -> bool:
+        return isinstance(value, list) and len(value) == 2
+
+    explicit = support_region.get("center_xy")
+    if _is_pair(explicit):
+        return np.asarray(explicit, dtype=np.float64)
+
+    box = support_region.get("aabb_xy")
+    if not (_is_pair(box) and all(_is_pair(corner) for corner in box)):
+        return np.zeros(2, dtype=np.float64)
+    corner_min = np.asarray(box[0], dtype=np.float64)
+    corner_max = np.asarray(box[1], dtype=np.float64)
+    return 0.5 * (corner_min + corner_max)
+
+
+def support_region_grid_center(
+    *,
+    support_region: dict[str, Any],
+    grid_name: str,
+) -> np.ndarray:
+    """Return the center of a named cell in a 3x3 grid over the support region.
+
+    Columns run along X (``left`` = lowest X, ``right`` = highest X) and rows
+    along Y (``front`` = lowest Y, ``back`` = highest Y). ``grid_name`` is
+    stringified and stripped of surrounding whitespace before lookup, so
+    padded or ``None`` names behave the same way as in ``_grid_to_region``.
+    Unknown names select the middle cell.
+
+    Args:
+        support_region: Region dict; must hold a well-formed ``aabb_xy``
+            (two-item list of two-item lists) for a cell to be computed.
+        grid_name: Cell name such as ``"left_front"``, ``"center"`` or
+            ``"back"``.
+
+    Returns:
+        A length-2 float64 array with the cell center, or ``[0, 0]`` when
+        ``aabb_xy`` is missing or malformed.
+    """
+    aabb_xy = support_region.get("aabb_xy")
+    if not (
+        isinstance(aabb_xy, list)
+        and len(aabb_xy) == 2
+        and all(isinstance(item, list) and len(item) == 2 for item in aabb_xy)
+    ):
+        return np.zeros(2, dtype=np.float64)
+    min_xy = np.asarray(aabb_xy[0], dtype=np.float64)
+    max_xy = np.asarray(aabb_xy[1], dtype=np.float64)
+    cell = (max_xy - min_xy) / 3.0
+    # Values are (column, row) indices, i.e. (X cell, Y cell).
+    grid_to_col_row = {
+        "left_front": (0, 0),
+        "center_front": (1, 0),
+        "right_front": (2, 0),
+        "left_center": (0, 1),
+        "center": (1, 1),
+        "right_center": (2, 1),
+        "left_back": (0, 2),
+        "center_back": (1, 2),
+        "right_back": (2, 2),
+        "front": (1, 0),
+        "back": (1, 2),
+        "left": (0, 1),
+        "right": (2, 1),
+    }
+    # Normalise the same way the grid -> region mapping does, so one grid name
+    # cannot resolve to a region label there and to the fallback cell here.
+    key = str(grid_name or "").strip()
+    col, row = grid_to_col_row.get(key, (1, 1))
+    center_x = min_xy[0] + (col + 0.5) * cell[0]
+    center_y = min_xy[1] + (row + 0.5) * cell[1]
+    return np.asarray([center_x, center_y], dtype=np.float64)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/schemas.py
new file mode 100644
index 000000000..015c41510
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/schemas.py
@@ -0,0 +1,19 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+# No public names are exported from this module yet.
+__all__: list[str] = []
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/utils.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/utils.py
new file mode 100644
index 000000000..51a89b638
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/layout_manager/utils.py
@@ -0,0 +1,1398 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+
+
+from __future__ import annotations
+
+import tempfile
+import traceback
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from scipy.optimize import minimize
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager import (
+ SimulationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.schemas import (
+ GravityDropRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+ GeometryManager,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import (
+ relative_path,
+)
+
+# Names exposed by a star-import of this module. Every entry is an
+# underscore-prefixed helper whose definition appears further down.
+__all__ = [
+ "_center_xy_aabb_layout",
+ "_object_scenes_xy_aabb_manifest",
+ "_settle_and_pack_object_footprints",
+ "_xy_aabb_overlap",
+ "_xy_union_area",
+ "_xy_union_bounds",
+]
+
+# Soft-penalty weights read by the SLSQP objective in
+# ``_optimize_text_layout_slp``:
+# - "seed": squared distance of each centre from its initial position
+# - "overlap": padded AABB overlap area between every pair of objects
+# - "grid": squared distance of anchored objects from their spring targets
+_WEIGHTS: dict[str, float] = {
+ "seed": 1.0,
+ "overlap": 200.0,
+ "grid": 3.0,
+}
+
+# Passed verbatim as ``options=`` to ``scipy.optimize.minimize`` with
+# ``method="SLSQP"``.
+_SLSQP_OPTIONS: dict[str, Any] = {
+ "maxiter": 300,
+ "ftol": 1.0e-6,
+ "disp": False,
+}
+
+def _object_scenes_xy_aabb_manifest(
+    *,
+    object_scenes: list[tuple[str, Any]],
+    trimesh: Any,
+    unit_scale: float,
+    unit: str,
+) -> dict[str, Any]:
+    """Describe the XY bounding box that encloses every object scene.
+
+    Args:
+        object_scenes: ``(object_id, scene)`` pairs; only the scene is used.
+        trimesh: Forwarded to ``GeometryManager.scene_to_mesh``.
+        unit_scale: Factor applied to the XY bounds before reporting.
+        unit: Label copied into the manifest as-is.
+
+    Returns:
+        A manifest with ``status == "empty"`` when there are no scenes,
+        otherwise ``status == "ok"`` plus the scaled min/max/center/size of
+        the union box and its area.
+    """
+    if not object_scenes:
+        return {"status": "empty", "unit": unit, "object_count": 0}
+
+    # Collect the lower and upper AABB corner of every scene separately.
+    lower_corners = []
+    upper_corners = []
+    for _, scene in object_scenes:
+        mesh = GeometryManager.scene_to_mesh(scene, trimesh=trimesh)
+        scene_bounds = np.asarray(mesh.bounds, dtype=np.float64)
+        lower_corners.append(scene_bounds[0])
+        upper_corners.append(scene_bounds[1])
+
+    union_min = np.vstack(lower_corners).min(axis=0)
+    union_max = np.vstack(upper_corners).max(axis=0)
+
+    # Only the first two axes (XY) are reported, in the caller's unit.
+    min_xy = union_min[:2] * unit_scale
+    max_xy = union_max[:2] * unit_scale
+    size_xy = max_xy - min_xy
+    center_xy = 0.5 * (min_xy + max_xy)
+
+    manifest: dict[str, Any] = {
+        "status": "ok",
+        "unit": unit,
+        "object_count": len(object_scenes),
+    }
+    manifest["min_xy"] = min_xy.tolist()
+    manifest["max_xy"] = max_xy.tolist()
+    manifest["center_xy"] = center_xy.tolist()
+    manifest["size_xy"] = size_xy.tolist()
+    manifest["area"] = float(size_xy[0] * size_xy[1])
+    return manifest
+
+
+
+def _settle_and_pack_object_footprints(
+ *,
+ object_scenes: list[tuple[str, Any]],
+ output_dir: Path,
+ output_root: Path,
+ trimesh: Any,
+) -> dict[str, Any]:
+ """Drop each object under simulated gravity, then pack the XY footprints.
+
+ For every ``(object_id, scene)`` pair the scene is transformed with
+ ``GeometryManager.aabb_bottom_to_xy_plane_transform``, exported as a GLB
+ into a temporary directory and handed to
+ ``SimulationManager.run_gravity_simulation``. When the simulation raises,
+ the identity transform is used instead and the traceback is recorded in
+ the manifest. The settled XY AABB sizes are then laid out by
+ ``_optimize_xy_aabb_footprint_layout`` and each source scene is copied
+ with ``placement @ gravity @ bottom_to_xy_plane``.
+
+ Args:
+ object_scenes: ``(object_id, scene)`` pairs to settle and pack.
+ output_dir: Only used to report a path relative to ``output_root``
+ in the manifest; nothing is written there by this function.
+ output_root: Root against which ``output_dir`` is made relative.
+ trimesh: Forwarded to ``GeometryManager.scene_to_mesh``.
+
+ Returns:
+ A dict with ``"object_scenes"`` (packed ``(id, scene)`` pairs),
+ ``"object_layout_transforms"`` (id -> 4x4 matrix) and ``"manifest"``.
+ """
+ sim = SimulationManager(headless=True, sim_device="cpu")
+ footprint_items: list[dict[str, Any]] = []
+ settled_entries: list[dict[str, Any]] = []
+ output_axis_transform = GeometryManager.z_up_to_glb_y_up_transform()
+ output_to_internal_transform = np.linalg.inv(output_axis_transform)
+
+ # Pre-gravity GLBs are only needed while the simulation runs, so they
+ # live in a temporary directory that is removed on exit.
+ with tempfile.TemporaryDirectory(prefix="p2s_footprint_drop_") as tmp_dir:
+ tmp_path = Path(tmp_dir)
+ for object_id, scene in object_scenes:
+ mesh = GeometryManager.scene_to_mesh(scene, trimesh=trimesh)
+ mesh_bounds = np.asarray(mesh.bounds, dtype=np.float64)
+ mesh_z_height = max(float(mesh_bounds[1][2] - mesh_bounds[0][2]), 0.0)
+ bottom_to_xy_plane_transform = GeometryManager.aabb_bottom_to_xy_plane_transform(
+ mesh_bounds
+ )
+ normalized_scene = GeometryManager.copy_scene_with_transform(
+ scene,
+ bottom_to_xy_plane_transform,
+ )
+ normalized_output_scene = GeometryManager.copy_scene_with_transform(
+ normalized_scene,
+ output_axis_transform,
+ )
+ # NOTE(review): object_id is interpolated into a file name — assumes
+ # ids contain no path separators; confirm against callers.
+ pre_gravity_path = tmp_path / f"{object_id}_pre_gravity.glb"
+ normalized_output_scene.export(pre_gravity_path)
+ # Drop height is 10 % of the object's own Z extent.
+ gravity_initial_height = mesh_z_height * 0.1
+
+ # Defaults describe the fallback used when the simulation fails.
+ gravity_status = "ok"
+ gravity_transform = np.eye(4, dtype=np.float64)
+ gravity_reason = ""
+ try:
+ gravity_result = sim.run_gravity_simulation(
+ GravityDropRequest(
+ glb_path=pre_gravity_path,
+ max_convex_hull_num=16,
+ initial_height=gravity_initial_height,
+ )
+ )
+ gravity_transform = GeometryManager.matrix_from_json(
+ gravity_result.final_pose,
+ name=f"{object_id}.gravity_final_pose",
+ )
+ except Exception:
+ # Best effort: keep the identity transform and surface the
+ # traceback through the manifest instead of aborting the batch.
+ gravity_status = "failed"
+ gravity_reason = traceback.format_exc()
+
+ # NOTE(review): the simulation ran on the axis-converted GLB export,
+ # while the resulting pose is applied to ``normalized_scene`` (before
+ # ``output_axis_transform``) — confirm the simulator reports the pose
+ # in the internal frame.
+ settled_origin_scene = GeometryManager.copy_scene_with_transform(
+ normalized_scene,
+ gravity_transform,
+ )
+ settled_mesh = GeometryManager.scene_to_mesh(
+ settled_origin_scene,
+ trimesh=trimesh,
+ )
+ settled_bounds = np.asarray(settled_mesh.bounds, dtype=np.float64)
+ settled_xy_center = GeometryManager.xy_aabb_center(settled_bounds)
+ settled_xy_size = GeometryManager.xy_aabb_size(settled_bounds)
+ settled_entries.append(
+ {
+ "id": object_id,
+ "scene": scene,
+ "bottom_to_xy_plane_transform": bottom_to_xy_plane_transform,
+ "mesh_z_height": mesh_z_height,
+ "gravity_initial_height": gravity_initial_height,
+ "gravity_transform": gravity_transform,
+ "settled_bounds": settled_bounds,
+ "settled_xy_center": settled_xy_center,
+ "settled_xy_size": settled_xy_size,
+ "gravity_status": gravity_status,
+ "gravity_reason": gravity_reason,
+ }
+ )
+
+ # Footprint sizes come from the settled meshes; the starting centres come
+ # from the original (unsettled) scenes.
+ layout_result = _optimize_xy_aabb_footprint_layout(
+ object_ids=[str(entry["id"]) for entry in settled_entries],
+ xy_sizes={
+ str(entry["id"]): np.asarray(entry["settled_xy_size"], dtype=np.float64)
+ for entry in settled_entries
+ },
+ current_centers={
+ str(entry["id"]): GeometryManager.xy_aabb_center(
+ GeometryManager.scene_to_mesh(
+ entry["scene"],
+ trimesh=trimesh,
+ ).bounds
+ )
+ for entry in settled_entries
+ },
+ )
+ target_centers = layout_result["centers"]
+
+ packed_object_scenes: list[tuple[str, Any]] = []
+ object_layout_transforms: dict[str, np.ndarray] = {}
+ for entry in settled_entries:
+ object_id = str(entry["id"])
+ settled_bounds = np.asarray(entry["settled_bounds"], dtype=np.float64)
+ target_xy = target_centers[object_id]
+ # Pure translation: move the settled XY centre onto the target and
+ # lift/lower the object so its settled minimum Z becomes zero.
+ placement_transform = np.eye(4, dtype=np.float64)
+ placement_transform[:3, 3] = [
+ float(target_xy[0] - entry["settled_xy_center"][0]),
+ float(target_xy[1] - entry["settled_xy_center"][1]),
+ -float(settled_bounds[0][2]),
+ ]
+ # Applied right-to-left: bottom-to-plane, then gravity pose, then placement.
+ object_transform = (
+ placement_transform
+ @ entry["gravity_transform"]
+ @ entry["bottom_to_xy_plane_transform"]
+ )
+ packed_scene = GeometryManager.copy_scene_with_transform(
+ entry["scene"],
+ object_transform,
+ )
+ packed_object_scenes.append((object_id, packed_scene))
+ object_layout_transforms[object_id] = object_transform
+
+ packed_bounds = np.asarray(
+ GeometryManager.scene_to_mesh(packed_scene, trimesh=trimesh).bounds,
+ dtype=np.float64,
+ )
+ footprint_items.append(
+ {
+ "id": object_id,
+ "gravity_status": entry["gravity_status"],
+ "gravity_reason": entry["gravity_reason"],
+ "bottom_to_xy_plane_transform": entry[
+ "bottom_to_xy_plane_transform"
+ ].tolist(),
+ "mesh_z_height": entry["mesh_z_height"],
+ "gravity_initial_height": entry["gravity_initial_height"],
+ "gravity_transform": entry["gravity_transform"].tolist(),
+ "placement_transform": placement_transform.tolist(),
+ "object_layout_transform": object_transform.tolist(),
+ "settled_xy_size": entry["settled_xy_size"].tolist(),
+ "target_xy_center": target_xy.tolist(),
+ "packed_bounds": packed_bounds.tolist(),
+ }
+ )
+
+ # The manifest status is always "ok"; per-object simulation failures are
+ # reported through each item's "gravity_status".
+ manifest = {
+ "status": "ok",
+ "method": "per_object_gravity_then_geometry_knn_2d_aabb_relaxation",
+ "output_dir": relative_path(str(output_dir), output_root),
+ "internal_up_axis": [0.0, 0.0, 1.0],
+ "gravity_glb_up_axis": [0.0, 1.0, 0.0],
+ "internal_to_gravity_glb_transform": output_axis_transform.tolist(),
+ "gravity_glb_to_internal_transform": output_to_internal_transform.tolist(),
+ "layout_optimization": layout_result["metadata"],
+ "items": footprint_items,
+ }
+ return {
+ "object_scenes": packed_object_scenes,
+ "object_layout_transforms": object_layout_transforms,
+ "manifest": manifest,
+ }
+
+
+
+def _optimize_xy_aabb_footprint_layout(
+ *,
+ object_ids: list[str],
+ xy_sizes: dict[str, np.ndarray],
+ current_centers: dict[str, np.ndarray],
+ padding_ratio: float = 0.08,
+) -> dict[str, Any]:
+ """Iteratively relax 2D AABB centres until padded footprints stop overlapping.
+
+ Each iteration (1) pushes every overlapping pair apart along the axis
+ with the smaller overlap, (2) pulls nearest-neighbour pairs back toward
+ their initial relative offset, (3) applies a compactness pull when the
+ union box has grown past ``target_expansion_ratio``, and (4) recentres
+ the whole layout on the origin.
+
+ Args:
+ object_ids: Ids to lay out; pair iteration follows this order.
+ xy_sizes: Id -> XY footprint size.
+ current_centers: Id -> starting XY centre; missing ids start at zero.
+ padding_ratio: Padding as a fraction of the largest footprint extent
+ (floored at ``1e-3``).
+
+ Returns:
+ ``{"centers": id -> ndarray, "metadata": dict}``. For empty input the
+ centres are empty and the metadata carries only method, iteration
+ count and a confidence of 1.0.
+ """
+ if not object_ids:
+ return {
+ "centers": {},
+ "metadata": {
+ "method": "geometry_knn_2d_aabb_relaxation",
+ "iterations": 0,
+ "confidence_score": 1.0,
+ },
+ }
+
+ max_extent = max(
+ float(max(xy_sizes[object_id][0], xy_sizes[object_id][1]))
+ for object_id in object_ids
+ )
+ padding = max(max_extent * padding_ratio, 1e-3)
+ # Tuning constants of the relaxation; all are echoed into the metadata.
+ max_iterations = 300
+ overlap_strength = 1.0
+ neighbor_strength = 0.04
+ compactness_strength = 0.01
+ target_expansion_ratio = 1.2
+ # At most three neighbours, and never more than the other objects available.
+ knn_k = min(3, max(len(object_ids) - 1, 0))
+ # Copy the inputs so the in-place updates below never touch caller arrays.
+ centers = {
+ object_id: np.asarray(
+ current_centers.get(object_id, np.zeros(2, dtype=np.float64)),
+ dtype=np.float64,
+ ).copy()
+ for object_id in object_ids
+ }
+ centers = _center_xy_aabb_layout(
+ centers=centers,
+ xy_sizes=xy_sizes,
+ )
+ initial_centers = {
+ object_id: center.copy()
+ for object_id, center in centers.items()
+ }
+ initial_union_bounds = _xy_union_bounds(
+ centers=initial_centers,
+ xy_sizes=xy_sizes,
+ )
+ # Neighbour edges are fixed from the initial layout and reused every iteration.
+ neighbor_edges = _knn_neighbor_edges(
+ centers=initial_centers,
+ k=knn_k,
+ )
+
+ iterations = 0
+ for iteration in range(max_iterations):
+ iterations = iteration + 1
+ # Largest single correction applied this iteration; drives the stop test.
+ max_delta = 0.0
+
+ # Step 1: separate overlapping pairs.
+ for i, object_id in enumerate(object_ids):
+ for other_id in object_ids[i + 1 :]:
+ overlap = _xy_aabb_overlap(
+ center_a=centers[object_id],
+ size_a=xy_sizes[object_id],
+ center_b=centers[other_id],
+ size_b=xy_sizes[other_id],
+ padding=padding,
+ )
+ if overlap is None:
+ continue
+ overlap_x, overlap_y = overlap
+ # Resolve along the axis with the smaller penetration; ties
+ # (including identical centres) push object_id to the negative side.
+ if overlap_x <= overlap_y:
+ axis = 0
+ sign = (
+ -1.0
+ if centers[object_id][0] <= centers[other_id][0]
+ else 1.0
+ )
+ amount = overlap_x
+ else:
+ axis = 1
+ sign = (
+ -1.0
+ if centers[object_id][1] <= centers[other_id][1]
+ else 1.0
+ )
+ amount = overlap_y
+ # Each object moves half the overlap (plus a tiny epsilon) in
+ # opposite directions.
+ shift = 0.5 * (amount + 1e-6) * overlap_strength
+ centers[object_id][axis] += sign * shift
+ centers[other_id][axis] -= sign * shift
+ max_delta = max(max_delta, shift)
+
+ # Step 2: spring each neighbour pair toward its initial relative offset.
+ for edge in neighbor_edges:
+ object_id = edge["object"]
+ neighbor_id = edge["neighbor"]
+ initial_delta = np.asarray(edge["initial_delta"], dtype=np.float64)
+ error = (centers[object_id] - centers[neighbor_id]) - initial_delta
+ correction = 0.5 * neighbor_strength * error
+ centers[object_id] -= correction
+ centers[neighbor_id] += correction
+ max_delta = max(max_delta, float(np.linalg.norm(correction)))
+
+ # Step 3: pull toward the union centre if the layout has spread too far.
+ max_delta = max(
+ max_delta,
+ _apply_compactness_pull(
+ centers=centers,
+ xy_sizes=xy_sizes,
+ initial_union_bounds=initial_union_bounds,
+ target_expansion_ratio=target_expansion_ratio,
+ strength=compactness_strength,
+ ),
+ )
+
+ # Step 4: recentre; this returns a new dict of new arrays.
+ centers = _center_xy_aabb_layout(
+ centers=centers,
+ xy_sizes=xy_sizes,
+ )
+ # Always run at least 21 iterations before accepting convergence.
+ if iteration >= 20 and max_delta < 1e-5:
+ break
+
+ diagnostics = _footprint_layout_diagnostics(
+ object_ids=object_ids,
+ centers=centers,
+ initial_centers=initial_centers,
+ xy_sizes=xy_sizes,
+ padding=padding,
+ initial_union_bounds=initial_union_bounds,
+ )
+ metadata = {
+ "method": "geometry_knn_2d_aabb_relaxation",
+ "relation_usage": "disabled",
+ "iterations": iterations,
+ "padding": padding,
+ "padding_ratio": padding_ratio,
+ "max_iterations": max_iterations,
+ "overlap_strength": overlap_strength,
+ "neighbor_strength": neighbor_strength,
+ "compactness_strength": compactness_strength,
+ "target_expansion_ratio": target_expansion_ratio,
+ "knn_k": knn_k,
+ "neighbor_edges": neighbor_edges,
+ "final_centers": {
+ object_id: centers[object_id].tolist()
+ for object_id in object_ids
+ },
+ **diagnostics,
+ }
+ return {"centers": centers, "metadata": metadata}
+
+
+
+def _knn_neighbor_edges(
+    *,
+    centers: dict[str, np.ndarray],
+    k: int,
+) -> list[dict[str, Any]]:
+    """Build de-duplicated k-nearest-neighbour edges between centres.
+
+    Ids are visited in sorted order. For each id the ``k`` closest other
+    ids are taken (ties broken by id), and an undirected pair is emitted
+    only the first time it is seen.
+
+    Returns:
+        A list of ``{"object", "neighbor", "initial_delta"}`` dicts, where
+        ``initial_delta`` is ``centers[object] - centers[neighbor]`` as a list.
+    """
+    if k <= 0 or len(centers) < 2:
+        return []
+
+    ordered_ids = sorted(centers)
+    linked_pairs: set[tuple[str, str]] = set()
+    result: list[dict[str, Any]] = []
+    for source_id in ordered_ids:
+        # (distance, id) tuples sort by distance first, then by id.
+        ranked = sorted(
+            (
+                float(np.linalg.norm(centers[source_id] - centers[candidate_id])),
+                candidate_id,
+            )
+            for candidate_id in ordered_ids
+            if candidate_id != source_id
+        )
+        for _, target_id in ranked[:k]:
+            pair = tuple(sorted((source_id, target_id)))
+            if pair in linked_pairs:
+                continue
+            linked_pairs.add(pair)
+            offset = centers[source_id] - centers[target_id]
+            result.append(
+                {
+                    "object": source_id,
+                    "neighbor": target_id,
+                    "initial_delta": offset.tolist(),
+                }
+            )
+    return result
+
+
+
+def _apply_compactness_pull(
+    *,
+    centers: dict[str, np.ndarray],
+    xy_sizes: dict[str, np.ndarray],
+    initial_union_bounds: np.ndarray,
+    target_expansion_ratio: float,
+    strength: float,
+) -> float:
+    """Pull centres toward the union-box centre when the layout has spread.
+
+    ``centers`` is updated in place (each value is replaced by a new array).
+    Nothing happens while the current union area divided by the initial
+    union area stays at or below ``target_expansion_ratio``.
+
+    Returns:
+        The largest displacement norm applied to any centre, or ``0.0``
+        when no pull was needed.
+    """
+    spread_bounds = _xy_union_bounds(centers=centers, xy_sizes=xy_sizes)
+    reference_area = max(_xy_union_area(initial_union_bounds), 1.0e-12)
+    growth = _xy_union_area(spread_bounds) / reference_area
+    if growth <= target_expansion_ratio:
+        return 0.0
+
+    # Relative overshoot beyond the target, capped at 1 so the pull stays bounded.
+    overshoot = min(growth / target_expansion_ratio - 1.0, 1.0)
+    anchor = 0.5 * (spread_bounds[0] + spread_bounds[1])
+    pull = strength * overshoot
+
+    largest_step = 0.0
+    # Only values of existing keys are replaced, so iterating the dict is safe.
+    for object_id in centers:
+        current = centers[object_id]
+        step = pull * (anchor - current)
+        centers[object_id] = current + step
+        largest_step = max(largest_step, float(np.linalg.norm(step)))
+    return largest_step
+
+
+
+def _footprint_layout_diagnostics(
+    *,
+    object_ids: list[str],
+    centers: dict[str, np.ndarray],
+    initial_centers: dict[str, np.ndarray],
+    xy_sizes: dict[str, np.ndarray],
+    padding: float,
+    initial_union_bounds: np.ndarray,
+) -> dict[str, Any]:
+    """Summarise the quality of a finished footprint layout.
+
+    Returns:
+        A dict with the remaining padded overlaps, the mean and maximum
+        displacement of centres from their initial positions, the ratio of
+        the final to the initial union-box area, and a confidence score
+        derived from those values.
+    """
+    leftover = _remaining_xy_overlaps(
+        object_ids=object_ids,
+        centers=centers,
+        xy_sizes=xy_sizes,
+        padding=padding,
+    )
+
+    moved = [
+        float(np.linalg.norm(centers[oid] - initial_centers[oid]))
+        for oid in object_ids
+    ]
+
+    final_bounds = _xy_union_bounds(centers=centers, xy_sizes=xy_sizes)
+    initial_area = max(_xy_union_area(initial_union_bounds), 1.0e-12)
+    expansion_ratio = _xy_union_area(final_bounds) / initial_area
+
+    if moved:
+        average_displacement = float(np.mean(moved))
+        max_displacement = float(np.max(moved))
+    else:
+        average_displacement = 0.0
+        max_displacement = 0.0
+
+    # With no objects there is no extent to normalise by; fall back to 1.0.
+    if object_ids:
+        largest_extent = max(
+            float(max(xy_sizes[oid][0], xy_sizes[oid][1]))
+            for oid in object_ids
+        )
+    else:
+        largest_extent = 1.0
+
+    confidence_score = _footprint_confidence_score(
+        remaining_overlap_count=len(leftover),
+        average_displacement=average_displacement,
+        max_extent=largest_extent,
+        expansion_ratio=expansion_ratio,
+    )
+    return {
+        "remaining_overlaps": leftover,
+        "average_displacement": average_displacement,
+        "max_displacement": max_displacement,
+        "union_aabb_expansion_ratio": expansion_ratio,
+        "confidence_score": confidence_score,
+    }
+
+
+
+def _remaining_xy_overlaps(
+    *,
+    object_ids: list[str],
+    centers: dict[str, np.ndarray],
+    xy_sizes: dict[str, np.ndarray],
+    padding: float,
+) -> list[dict[str, Any]]:
+    """List every unordered pair whose padded XY AABBs still overlap.
+
+    Pairs are reported in ``object_ids`` order (earlier id first), each as
+    ``{"object", "other", "overlap_x", "overlap_y"}``.
+    """
+    # Enumerate each unordered pair exactly once, preserving list order.
+    pairs = [
+        (first_id, second_id)
+        for position, first_id in enumerate(object_ids)
+        for second_id in object_ids[position + 1 :]
+    ]
+
+    found: list[dict[str, Any]] = []
+    for first_id, second_id in pairs:
+        hit = _xy_aabb_overlap(
+            center_a=centers[first_id],
+            size_a=xy_sizes[first_id],
+            center_b=centers[second_id],
+            size_b=xy_sizes[second_id],
+            padding=padding,
+        )
+        if hit is not None:
+            found.append(
+                {
+                    "object": first_id,
+                    "other": second_id,
+                    "overlap_x": hit[0],
+                    "overlap_y": hit[1],
+                }
+            )
+    return found
+
+
+
+def _footprint_confidence_score(
+    *,
+    remaining_overlap_count: int,
+    average_displacement: float,
+    max_extent: float,
+    expansion_ratio: float,
+) -> float:
+    """Score a layout in ``[0, 1]`` by subtracting capped penalties from 1.
+
+    Penalties:
+        * overlaps: 0.35 per remaining overlap, capped at 0.7;
+        * displacement: 0.1 x (average displacement / max extent), capped at 0.2;
+        * expansion: 0.25 x (expansion ratio above 1.2), capped at 0.2.
+    """
+    # Guard against a zero extent when normalising the displacement.
+    scale = max(max_extent, 1.0e-6)
+    penalties = (
+        min(0.35 * remaining_overlap_count, 0.7),
+        min(0.1 * average_displacement / scale, 0.2),
+        min(max(expansion_ratio - 1.2, 0.0) * 0.25, 0.2),
+    )
+    score = 1.0
+    for penalty in penalties:
+        score -= penalty
+    return float(np.clip(score, 0.0, 1.0))
+
+
+
+def _center_xy_aabb_layout(
+    *,
+    centers: dict[str, np.ndarray],
+    xy_sizes: dict[str, np.ndarray],
+) -> dict[str, np.ndarray]:
+    """Translate all centres so the union AABB is centred on the origin.
+
+    Returns a new dict of new arrays; an empty ``centers`` mapping is
+    returned unchanged (the same object).
+    """
+    if not centers:
+        return centers
+
+    lower_rows = []
+    upper_rows = []
+    for object_id, center in centers.items():
+        half_extent = 0.5 * np.asarray(xy_sizes[object_id], dtype=np.float64)
+        lower_rows.append(center - half_extent)
+        upper_rows.append(center + half_extent)
+
+    union_low = np.vstack(lower_rows).min(axis=0)
+    union_high = np.vstack(upper_rows).max(axis=0)
+    midpoint = 0.5 * (union_low + union_high)
+
+    recentred: dict[str, np.ndarray] = {}
+    for object_id, center in centers.items():
+        recentred[object_id] = np.asarray(center, dtype=np.float64) - midpoint
+    return recentred
+
+
+
+def _xy_union_bounds(
+    *,
+    centers: dict[str, np.ndarray],
+    xy_sizes: dict[str, np.ndarray],
+) -> np.ndarray:
+    """Return the ``[[min_x, min_y], [max_x, max_y]]`` box enclosing all AABBs.
+
+    An empty ``centers`` mapping yields a 2x2 array of zeros.
+    """
+    if not centers:
+        return np.zeros((2, 2), dtype=np.float64)
+
+    lower_rows = []
+    upper_rows = []
+    for object_id, center in centers.items():
+        middle = np.asarray(center, dtype=np.float64)
+        half_extent = 0.5 * np.asarray(xy_sizes[object_id], dtype=np.float64)
+        lower_rows.append(middle - half_extent)
+        upper_rows.append(middle + half_extent)
+
+    union_low = np.vstack(lower_rows).min(axis=0)
+    union_high = np.vstack(upper_rows).max(axis=0)
+    return np.vstack([union_low, union_high])
+
+
+
+def _xy_union_area(bounds: np.ndarray) -> float:
+    """Return the area of a ``[[min], [max]]`` XY box.
+
+    Each side length is floored at ``1e-9`` so a degenerate box still
+    yields a strictly positive area.
+    """
+    box = np.asarray(bounds, dtype=np.float64)
+    lower, upper = box[0], box[1]
+    extent = np.maximum(upper - lower, 1.0e-9)
+    width = extent[0]
+    height = extent[1]
+    return float(width * height)
+
+
+
+def _xy_aabb_overlap(
+    *,
+    center_a: np.ndarray,
+    size_a: np.ndarray,
+    center_b: np.ndarray,
+    size_b: np.ndarray,
+    padding: float,
+) -> tuple[float, float] | None:
+    """Return the padded overlap of two XY AABBs along X and Y.
+
+    The result is ``None`` unless the padded boxes overlap by a strictly
+    positive amount on both axes; touching boxes do not count.
+    """
+    half_a = 0.5 * np.asarray(size_a, dtype=np.float64)
+    half_b = 0.5 * np.asarray(size_b, dtype=np.float64)
+    position_a = np.asarray(center_a, dtype=np.float64)
+    position_b = np.asarray(center_b, dtype=np.float64)
+
+    separation = np.abs(position_b - position_a)
+    penetration = half_a + half_b + padding - separation
+
+    depth_x = float(penetration[0])
+    if depth_x <= 0.0:
+        return None
+    depth_y = float(penetration[1])
+    if depth_y <= 0.0:
+        return None
+    return depth_x, depth_y
+# http://www.apache.org/licenses/LICENSE-2.0
+# distributed under the License is distributed on an "AS IS" BASIS,
+
+
+
+from typing import Any
+
+import numpy as np
+
+__all__: list[str] = []
+
+def _transitive_closure(
+    nodes: list[str],
+    edges: list[tuple[str, str]],
+) -> list[tuple[str, str]]:
+    """Return the transitive closure of *edges* restricted to *nodes*.
+
+    Uses a Warshall-style boolean reachability matrix, so it is meant for
+    small node sets. Edges touching unknown nodes are ignored. When either
+    input is empty a plain copy of *edges* is returned. Result pairs are
+    ordered by the position of their endpoints in *nodes*.
+    """
+    if not nodes or not edges:
+        return list(edges)
+
+    position = {name: index for index, name in enumerate(nodes)}
+    size = len(nodes)
+    reach = [[False] * size for _ in range(size)]
+    for src, dst in edges:
+        if src in position and dst in position:
+            reach[position[src]][position[dst]] = True
+
+    # Whatever the pivot reaches becomes reachable from every row that
+    # already reaches the pivot.
+    for pivot in range(size):
+        pivot_row = reach[pivot]
+        for start_row in reach:
+            if not start_row[pivot]:
+                continue
+            for target, reachable in enumerate(pivot_row):
+                if reachable:
+                    start_row[target] = True
+
+    return [
+        (nodes[start], nodes[target])
+        for start in range(size)
+        for target in range(size)
+        if reach[start][target]
+    ]
+
+
+
+def _longest_path_ranks(
+    nodes: list[str],
+    edges: list[tuple[str, str]],
+) -> dict[str, int]:
+    """Assign integer ranks so that every edge ``(A, B)`` gets rank[A] < rank[B].
+
+    Runs Kahn's topological sort followed by a longest-path relaxation.
+
+    Args:
+        nodes: All node names; every one receives a rank.
+        edges: Directed ``(src, dst)`` pairs. Edges that mention a node not
+            in *nodes* are ignored.
+
+    Returns:
+        A ``node -> rank`` dict. Isolated nodes get rank 0. The ordering
+        guarantee only holds for acyclic input: nodes on a cycle are never
+        dequeued, so they are not relaxed against each other and keep rank 0
+        unless an acyclic predecessor pushes them higher.
+    """
+    # Function-scope import keeps the module's import block untouched.
+    from collections import deque
+
+    ranks: dict[str, int] = {n: 0 for n in nodes}
+    if not edges:
+        return ranks
+
+    # Build adjacency lists and in-degrees over the known nodes only.
+    successors: dict[str, list[str]] = {n: [] for n in nodes}
+    in_degree: dict[str, int] = {n: 0 for n in nodes}
+    for src, dst in edges:
+        if src not in successors or dst not in successors:
+            continue
+        successors[src].append(dst)
+        in_degree[dst] += 1
+
+    # Kahn topological sort; a deque gives O(1) pops from the front where
+    # ``list.pop(0)`` is O(n).
+    ready = deque(n for n in nodes if in_degree[n] == 0)
+    order: list[str] = []
+    while ready:
+        current = ready.popleft()
+        order.append(current)
+        for nxt in successors[current]:
+            in_degree[nxt] -= 1
+            if in_degree[nxt] == 0:
+                ready.append(nxt)
+
+    # Longest-path relaxation in topological order.
+    for current in order:
+        candidate = ranks[current] + 1
+        for nxt in successors[current]:
+            if ranks[nxt] < candidate:
+                ranks[nxt] = candidate
+    return ranks
+
+
+
+def _layout_text_objects_grid(
+ *,
+ object_ids: list[str],
+ xy_sizes: dict[str, np.ndarray],
+ spatial_relations: list[dict[str, Any]],
+ table_constraints: list[dict[str, Any]] | None = None,
+ grid_spacing: float = 0.02,
+ padding_ratio: float = 0.08,
+) -> dict[str, Any]:
+ """Lay out text-scene objects from relations and nine-grid constraints.
+
+ Steps:
+ 1. Transitive closure of the ``left_of`` / ``front_of`` relations.
+ 2. Pick a centre object: an explicit nine-grid "center" constraint,
+ otherwise the highest-degree object (ties broken by footprint area).
+ 3. Longest-path rank assignment (``left_of`` -> column, ``front_of`` -> row).
+ 4. Shift objects connected to a nine-grid anchor by the anchor's offset.
+ 5. Align rows/columns along relations, normalise ranks, spread cell clashes.
+ 6. Wrap unconstrained objects into extra rows below.
+ 7. Convert ranks to XY using per-column / per-row maximum sizes plus
+ ``grid_spacing``, then recentre on the origin.
+ 8. Refine with ``_optimize_text_layout_slp``.
+
+ Args:
+ object_ids: Ids to place.
+ xy_sizes: Id -> XY footprint size.
+ spatial_relations: Dicts read via the keys "subject", "relation",
+ "object"; only "left_of" and "front_of" relations are used.
+ table_constraints: Optional dicts read via the keys "asset" and "grid".
+ grid_spacing: Gap added between adjacent columns and rows.
+ padding_ratio: Forwarded to ``_optimize_text_layout_slp``.
+
+ Returns:
+ ``{"centers", "initial_centers", "metadata"}``; ``initial_centers`` are
+ the grid positions before optimisation.
+ """
+ if not object_ids:
+ return {
+ "centers": {},
+ "initial_centers": {},
+ "metadata": {
+ "method": "transitive_closure_longest_path_with_9grid",
+ "iterations": 0,
+ },
+ }
+
+ # Parse spatial relations.
+ # Self-relations, incomplete entries and exact duplicates are dropped.
+ left_of_edges: list[tuple[str, str]] = []
+ front_of_edges: list[tuple[str, str]] = []
+ seen: set[tuple[str, str, str]] = set()
+ for rel in spatial_relations:
+ subject = str(rel.get("subject") or "")
+ obj = str(rel.get("object") or "")
+ relation = str(rel.get("relation") or "")
+ if not subject or not obj or subject == obj:
+ continue
+ key = (subject, relation, obj)
+ if key in seen:
+ continue
+ seen.add(key)
+ if relation == "left_of":
+ left_of_edges.append((subject, obj))
+ elif relation == "front_of":
+ front_of_edges.append((subject, obj))
+
+ # Compute transitive closures.
+ left_of_closed = _transitive_closure(object_ids, left_of_edges)
+ front_of_closed = _transitive_closure(object_ids, front_of_edges)
+
+ # Parse nine-grid constraints.
+ # Values are (col, row). -Y = front, so front row = 0, back row = 2.
+ _GRID_TO_RC: dict[str, tuple[int, int]] = {
+ "left_front": (0, 0), "center_front": (1, 0), "right_front": (2, 0),
+ "left_center": (0, 1), "center": (1, 1), "right_center": (2, 1),
+ "left_back": (0, 2), "center_back": (1, 2), "right_back": (2, 2),
+ "front": (1, 0), "back": (1, 2),
+ "left": (0, 1), "right": (2, 1),
+ }
+ grid_targets: dict[str, tuple[int, int]] = {}
+ for tc in (table_constraints or []):
+ asset = str(tc.get("asset") or "")
+ grid_name = str(tc.get("grid") or "").strip()
+ if asset in object_ids and grid_name in _GRID_TO_RC:
+ grid_targets[asset] = _GRID_TO_RC[grid_name]
+
+ # Select a center object when none is explicit.
+ # NOTE(review): this check compares the raw "grid" value and ignores
+ # whether the asset is known, whereas the parsing above strips the name
+ # and requires the asset to be in object_ids — confirm the two are meant
+ # to differ.
+ auto_center_oid: str | None = None
+ has_explicit_center = any(
+ tc.get("grid") == "center" for tc in (table_constraints or [])
+ )
+ if not has_explicit_center:
+ # Degree = appearances in left_of + front_of (subject or object)
+ degree: dict[str, int] = {oid: 0 for oid in object_ids}
+ for src, dst in left_of_closed + front_of_closed:
+ if src in degree:
+ degree[src] += 1
+ if dst in degree:
+ degree[dst] += 1
+ max_deg = max(degree.values()) if degree else 0
+ if max_deg > 0:
+ candidates = [oid for oid, d in degree.items() if d == max_deg]
+ # Tie-breaker: largest AABB area
+ centre_oid = max(
+ candidates,
+ key=lambda oid: float(xy_sizes[oid][0]) * float(xy_sizes[oid][1]),
+ )
+ # This overwrites any non-centre grid target already set for the object.
+ grid_targets[centre_oid] = (1, 1) # nine-grid centre
+ auto_center_oid = centre_oid
+
+ # Derive ranks from the transitive closures.
+ x_rank = _longest_path_ranks(object_ids, left_of_closed)
+ # -Y = front: A front_of B -> A.y < B.y -> row[A] < row[B].
+ # _longest_path_ranks gives rank[src] < rank[dst]; edges are
+ # already (A,B) for "A front_of B", so NO reversal needed.
+ y_rank = _longest_path_ranks(object_ids, front_of_closed)
+
+ # Apply nine-grid shifts.
+ # Objects connected to a nine-grid anchor (both upstream and downstream)
+ # are shifted by the anchor's offset to its target rank.
+ # NOTE(review): only non-anchored members are shifted; the anchor's own
+ # x_rank / y_rank are never set to (target_col, target_row) here —
+ # confirm whether the anchor should be pinned as well.
+ if grid_targets:
+ # Build undirected connected-components via relation edges
+ all_edges = left_of_closed + front_of_closed
+ neighbours: dict[str, set[str]] = {oid: set() for oid in object_ids}
+ for src, dst in all_edges:
+ if src in neighbours and dst in neighbours:
+ neighbours[src].add(dst)
+ neighbours[dst].add(src)
+ for oid in grid_targets:
+ neighbours.setdefault(oid, set())
+
+ # For each nine-grid object, BFS the component and shift uniformly.
+ # A component is shifted once, by the first anchor encountered in it.
+ shifted: set[str] = set()
+ for oid, (target_col, target_row) in grid_targets.items():
+ if oid in shifted:
+ continue
+ dx = target_col - x_rank.get(oid, 0)
+ dy = target_row - y_rank.get(oid, 0)
+
+ # BFS to collect the full connected component
+ component: set[str] = {oid}
+ queue = [oid]
+ while queue:
+ u = queue.pop(0)
+ for v in neighbours.get(u, set()):
+ if v not in component:
+ component.add(v)
+ queue.append(v)
+
+ for oid2 in component:
+ if oid2 not in grid_targets: # only shift non-anchored objects
+ x_rank[oid2] = x_rank.get(oid2, 0) + dx
+ y_rank[oid2] = y_rank.get(oid2, 0) + dy
+ shifted.update(component)
+
+ # Propagate row and column alignment.
+ # left_of A B -> same row (y_rank[A] = y_rank[B])
+ # front_of A B -> same col (x_rank[A] = x_rank[B])
+ # Priority (higher wins): nine-grid > higher degree > larger area.
+ # Tuples compare element-wise, which yields exactly that priority order.
+ _prio = {
+ oid: (
+ oid in grid_targets,
+ sum(1 for e in left_of_closed + front_of_closed if oid in e),
+ float(xy_sizes[oid][0]) * float(xy_sizes[oid][1]),
+ )
+ for oid in object_ids
+ }
+ for src, dst in left_of_closed:
+ if _prio[src] >= _prio[dst]:
+ y_rank[dst] = y_rank.get(src, 0)
+ else:
+ y_rank[src] = y_rank.get(dst, 0)
+ for src, dst in front_of_closed:
+ if _prio[src] >= _prio[dst]:
+ x_rank[dst] = x_rank.get(src, 0)
+ else:
+ x_rank[src] = x_rank.get(dst, 0)
+
+ # Normalise to >= 0
+ min_x = min(x_rank.values()) if x_rank else 0
+ min_y = min(y_rank.values()) if y_rank else 0
+ for oid in object_ids:
+ x_rank[oid] = x_rank.get(oid, 0) - min_x
+ y_rank[oid] = y_rank.get(oid, 0) - min_y
+
+ # Resolve cell collisions: spread objects sharing the same (col, row)
+ # NOTE(review): the grouping includes unconstrained objects (which are
+ # re-placed below), and the new column ``col + offset`` is not re-checked
+ # against cells that are already occupied — verify this is acceptable.
+ cell_occupants: dict[tuple[int, int], list[str]] = {}
+ for oid in object_ids:
+ cell = (x_rank[oid], y_rank[oid])
+ cell_occupants.setdefault(cell, []).append(oid)
+ for (col, row), occupants in cell_occupants.items():
+ if len(occupants) > 1:
+ for offset, oid in enumerate(occupants[1:], start=1):
+ x_rank[oid] = col + offset
+
+ # Place unconstrained objects in wrapped rows.
+ constrained = set()
+ for src, dst in left_of_closed + front_of_closed:
+ constrained.update([src, dst])
+ constrained.update(grid_targets)
+ free_objects = [oid for oid in object_ids if oid not in constrained]
+
+ if free_objects:
+ free_row = max(y_rank.values()) + 1 if y_rank else 0
+ # Row width = number of columns currently spanned, but at least 3.
+ col_keys = list(x_rank.values())
+ existing_cols = max(col_keys) - min(col_keys) + 1 if col_keys else 1
+ max_cols_per_row = max(existing_cols, 3)
+ # Widest objects are placed first.
+ free_sorted = sorted(
+ free_objects,
+ key=lambda oid: float(xy_sizes[oid][0]),
+ reverse=True,
+ )
+ col = 0
+ row_offset = 0
+ for oid in free_sorted:
+ x_rank[oid] = col
+ y_rank[oid] = free_row + row_offset
+ col += 1
+ if col >= max_cols_per_row:
+ col = 0
+ row_offset += 1
+
+ # Convert ranks to XY positions.
+ # Each column is as wide as its widest object, each row as deep as its
+ # deepest object.
+ col_widths: dict[int, float] = {}
+ row_heights: dict[int, float] = {}
+ for oid in object_ids:
+ c = x_rank[oid]
+ r = y_rank[oid]
+ col_widths[c] = max(col_widths.get(c, 0.0), float(xy_sizes[oid][0]))
+ row_heights[r] = max(row_heights.get(r, 0.0), float(xy_sizes[oid][1]))
+
+ # Running offsets of each column / row start, with a gap between cells.
+ x_cumsum: dict[int, float] = {}
+ cumulative = 0.0
+ for c in sorted(col_widths):
+ x_cumsum[c] = cumulative
+ cumulative += col_widths[c] + grid_spacing
+
+ y_cumsum: dict[int, float] = {}
+ cumulative = 0.0
+ for r in sorted(row_heights):
+ y_cumsum[r] = cumulative
+ cumulative += row_heights[r] + grid_spacing
+
+ # Objects are aligned to the start of their cell, not centred within it.
+ centers: dict[str, np.ndarray] = {}
+ for oid in object_ids:
+ c = x_rank[oid]
+ r = y_rank[oid]
+ cx = x_cumsum[c] + 0.5 * float(xy_sizes[oid][0])
+ cy = y_cumsum[r] + 0.5 * float(xy_sizes[oid][1])
+ centers[oid] = np.array([cx, cy], dtype=np.float64)
+
+ centers = _center_xy_aabb_layout(centers=centers, xy_sizes=xy_sizes)
+
+ initial_centers = {oid: c.copy() for oid, c in centers.items()}
+
+ # Use the initial grid positions of anchored objects as spring targets
+ grid_spring_targets: dict[str, np.ndarray] = {
+ oid: initial_centers[oid].copy()
+ for oid in grid_targets
+ if oid in initial_centers
+ }
+
+ # Optimize positions and remove mesh AABB collisions.
+ optimized = _optimize_text_layout_slp(
+ object_ids=object_ids,
+ xy_sizes=xy_sizes,
+ initial_centers=initial_centers,
+ left_of_edges=left_of_closed,
+ front_of_edges=front_of_closed,
+ grid_spring_targets=grid_spring_targets,
+ padding_ratio=padding_ratio,
+ )
+ centers = optimized["centers"]
+ optimization_metadata = optimized["metadata"]
+
+ # Collect layout metadata.
+ # "table_constraint_count" counts grid targets, including an auto-selected centre.
+ metadata = {
+ "method": "transitive_closure_longest_path_with_9grid_and_sa",
+ "grid_spacing": grid_spacing,
+ "auto_center_oid": auto_center_oid,
+ "has_explicit_center": has_explicit_center,
+ "table_constraint_count": len(grid_targets),
+ "left_of_count": len(left_of_edges),
+ "left_of_closed_count": len(left_of_closed),
+ "front_of_count": len(front_of_edges),
+ "front_of_closed_count": len(front_of_closed),
+ "free_object_count": len(free_objects),
+ "x_ranks": {oid: x_rank.get(oid, 0) for oid in object_ids},
+ "y_ranks": {oid: y_rank.get(oid, 0) for oid in object_ids},
+ "optimization": optimization_metadata,
+ }
+ return {
+ "centers": centers,
+ "initial_centers": initial_centers,
+ "metadata": metadata,
+ }
+# http://www.apache.org/licenses/LICENSE-2.0
+
+def _optimize_text_layout_slp(
+ *,
+ object_ids: list[str],
+ xy_sizes: dict[str, np.ndarray],
+ initial_centers: dict[str, np.ndarray],
+ left_of_edges: list[tuple[str, str]],
+ front_of_edges: list[tuple[str, str]],
+ grid_spring_targets: dict[str, np.ndarray],
+ padding_ratio: float,
+ fixed_object_ids: list[str] | None = None,
+) -> dict[str, Any]:
+ """Optimize 2D centres with scipy SLSQP, then remove mesh AABB overlap.
+
+ Mirroring the original example_optimization/SA pipeline:
+ - left_of / front_of → linear inequality constraints
+ - bounding box → variable bounds (2× initial union)
+ - seed / overlap / grid → soft penalties in the objective
+ - post‑solve collision cleanup on actual footprint AABBs
+ """
+ if not object_ids:
+ return {
+ "centers": {},
+ "metadata": {
+ "method": "text_slsqp_then_mesh_aabb_collision_removal",
+ "slsqp_iterations": 0,
+ "collision_iterations": 0,
+ },
+ }
+
+ max_extent = max(
+ float(max(xy_sizes[oid][0], xy_sizes[oid][1])) for oid in object_ids
+ )
+ padding = max(max_extent * padding_ratio, 1e-3)
+
+ initial_centers = {
+ oid: np.asarray(initial_centers[oid], dtype=np.float64).copy()
+ for oid in object_ids
+ }
+ fixed_ids = {
+ oid for oid in (fixed_object_ids or []) if oid in initial_centers
+ }
+ initial_union_bounds = _xy_union_bounds(
+ centers=initial_centers,
+ xy_sizes=xy_sizes,
+ )
+
+ index_by_id = {oid: i for i, oid in enumerate(object_ids)}
+ x0 = _pack_centers(object_ids, initial_centers)
+
+ # Build linear inequality constraints for left_of and front_of.
+ constraints: list[dict[str, Any]] = []
+ _build_relation_constraints(
+ constraints=constraints,
+ object_ids=object_ids,
+ index_by_id=index_by_id,
+ xy_sizes=xy_sizes,
+ left_of_edges=left_of_edges,
+ front_of_edges=front_of_edges,
+ padding=padding,
+ )
+
+ # Bound variables to twice the initial union size.
+ init_size = initial_union_bounds[1] - initial_union_bounds[0]
+ margin = init_size * 0.5 # 50 % each side → 2× total
+ bounds = []
+ for oid in object_ids:
+ if oid in fixed_ids:
+ bounds.append(
+ (
+ float(initial_centers[oid][0]),
+ float(initial_centers[oid][0]),
+ )
+ )
+ bounds.append(
+ (
+ float(initial_centers[oid][1]),
+ float(initial_centers[oid][1]),
+ )
+ )
+ continue
+ bounds.append(
+ (
+ float(initial_union_bounds[0, 0] - margin[0]),
+ float(initial_union_bounds[1, 0] + margin[0]),
+ )
+ ) # x
+ bounds.append(
+ (
+ float(initial_union_bounds[0, 1] - margin[1]),
+ float(initial_union_bounds[1, 1] + margin[1]),
+ )
+ ) # y
+
+ # Define the optimization objective.
+ def _objective(xvec: np.ndarray) -> float:
+ centers = _unpack_centers(object_ids, xvec)
+ loss = 0.0
+
+ # seed: stay close to initial positions
+ for oid in object_ids:
+ delta = centers[oid] - initial_centers[oid]
+ loss += _WEIGHTS["seed"] * float(np.dot(delta, delta))
+
+ # overlap: AABB overlap area penalty
+ for i, oid in enumerate(object_ids):
+ for other_id in object_ids[i + 1 :]:
+ ov = _xy_aabb_overlap(
+ center_a=centers[oid],
+ size_a=xy_sizes[oid],
+ center_b=centers[other_id],
+ size_b=xy_sizes[other_id],
+ padding=padding,
+ )
+ if ov is not None:
+ loss += _WEIGHTS["overlap"] * float(ov[0] * ov[1])
+
+ # grid: spring toward 9‑grid targets
+ for oid, target in grid_spring_targets.items():
+ if oid not in centers:
+ continue
+ delta = centers[oid] - target
+ loss += _WEIGHTS["grid"] * float(np.dot(delta, delta))
+
+ return float(loss)
+
+ # Solve the constrained optimization problem.
+ slsqp_result: dict[str, Any] = {"success": False, "nit": 0, "message": ""}
+ try:
+ result = minimize(
+ _objective,
+ x0,
+ method="SLSQP",
+ bounds=bounds,
+ constraints=constraints,
+ options=_SLSQP_OPTIONS,
+ )
+ slsqp_result = {
+ "success": bool(result.success),
+ "nit": int(getattr(result, "nit", 0)),
+ "message": str(result.message),
+ "fun": float(result.fun) if result.fun is not None else None,
+ }
+ if result.success:
+ x_opt = result.x
+ else:
+ # SLSQP failed — fall back to seed positions
+ x_opt = x0.copy()
+ except Exception:
+ x_opt = x0.copy()
+ slsqp_result["message"] = "SLSQP raised an exception; using seed positions."
+
+ centers = _unpack_centers(object_ids, x_opt)
+ centers = _center_xy_aabb_layout(centers=centers, xy_sizes=xy_sizes)
+
+ # Remove residual collisions.
+ centers, collision_metadata = _remove_mesh_aabb_collisions(
+ object_ids=object_ids,
+ xy_sizes=xy_sizes,
+ centers=centers,
+ initial_centers=initial_centers,
+ left_of_edges=left_of_edges,
+ front_of_edges=front_of_edges,
+ padding=padding,
+ fixed_object_ids=fixed_ids,
+ )
+ centers = _center_xy_aabb_layout(centers=centers, xy_sizes=xy_sizes)
+
+ # Collect optimization metadata.
+ diagnostics = _footprint_layout_diagnostics(
+ object_ids=object_ids,
+ centers=centers,
+ initial_centers=initial_centers,
+ xy_sizes=xy_sizes,
+ padding=padding,
+ initial_union_bounds=initial_union_bounds,
+ )
+ metadata: dict[str, Any] = {
+ "method": "text_slsqp_then_mesh_aabb_collision_removal",
+ "relation_usage": "left_of_front_of_hard_constraints",
+ "padding": float(padding),
+ "padding_ratio": float(padding_ratio),
+ "weights": dict(_WEIGHTS),
+ "fixed_object_ids": sorted(fixed_ids),
+ "slsqp": slsqp_result,
+ "bounds_expansion": 2.0,
+ "initial_union_size": init_size.tolist(),
+ **collision_metadata,
+ "final_centers": {
+ oid: centers[oid].tolist() for oid in object_ids
+ },
+ **diagnostics,
+ }
+ return {"centers": centers, "metadata": metadata}
+
+
+# Build relation constraints.
+
+
+ def _build_relation_constraints(
+     *,
+     constraints: list[dict[str, Any]],
+     object_ids: list[str],
+     index_by_id: dict[str, int],
+     xy_sizes: dict[str, np.ndarray],
+     left_of_edges: list[tuple[str, str]],
+     front_of_edges: list[tuple[str, str]],
+     padding: float,
+ ) -> None:
+     """Append SLSQP inequality constraints for left_of / front_of edges.
+
+     Every usable edge ``(subject, obj)`` adds one
+     ``{"type": "ineq", "fun": ...}`` entry whose function is non-negative
+     exactly when ``obj`` lies at least ``gap`` beyond ``subject`` on the
+     relation axis. ``gap`` is half of each object's extent on that axis plus
+     ``padding``. ``left_of`` edges act on x (slot ``2 * i`` of the packed
+     vector) and ``front_of`` edges act on y (slot ``2 * i + 1``).
+
+     Args:
+         constraints: Output list, extended in place; all left_of entries are
+             appended before any front_of entry.
+         object_ids: Not read by this function; kept so existing keyword
+             callers continue to work.
+         index_by_id: Position of each object inside the packed vector.
+         xy_sizes: Per-object extents, indexed ``[0]`` for x and ``[1]`` for y.
+         left_of_edges: ``(subject, obj)`` pairs constrained along x.
+         front_of_edges: ``(subject, obj)`` pairs constrained along y.
+         padding: Extra clearance added to every gap.
+     """
+     for axis, edges in ((0, left_of_edges), (1, front_of_edges)):
+         for subject, obj in edges:
+             # Edges that mention an object outside the problem are ignored.
+             if subject not in index_by_id or obj not in index_by_id:
+                 continue
+             # An object cannot be left of / in front of itself. Such an edge
+             # would evaluate to the constant ``-gap`` and, for any positive
+             # gap, make the whole problem infeasible, so it is dropped.
+             if subject == obj:
+                 continue
+             slot_subject = 2 * index_by_id[subject] + axis
+             slot_obj = 2 * index_by_id[obj] + axis
+             # subject + gap <= obj  ->  obj - subject - gap >= 0
+             gap = (
+                 0.5 * float(xy_sizes[subject][axis])
+                 + 0.5 * float(xy_sizes[obj][axis])
+                 + padding
+             )
+             constraints.append(
+                 {
+                     "type": "ineq",
+                     # Defaults bind the per-edge values at definition time.
+                     "fun": lambda x, lo=slot_subject, hi=slot_obj, g=gap: float(
+                         x[hi] - x[lo] - g
+                     ),
+                 }
+             )
+
+
+# Remove AABB collisions.
+
+
+ def _remove_mesh_aabb_collisions(
+ *,
+ object_ids: list[str],
+ xy_sizes: dict[str, np.ndarray],
+ centers: dict[str, np.ndarray],
+ initial_centers: dict[str, np.ndarray],
+ left_of_edges: list[tuple[str, str]],
+ front_of_edges: list[tuple[str, str]],
+ padding: float,
+ fixed_object_ids: set[str] | None = None,
+ ) -> tuple[dict[str, np.ndarray], dict[str, Any]]:
+ """Push overlapping XY footprints apart until none remain or rounds run out.
+
+ Works on float64 copies of ``centers``. Each round asks
+ ``_mesh_aabb_collision_pairs`` for the padded overlaps and shifts the two
+ objects of every pair in opposite directions along the reported axis:
+
+ * both objects fixed: the pair is skipped;
+ * one object fixed: the other one absorbs the whole shift;
+ * pair linked by a left_of/front_of edge (either orientation): even split;
+ * otherwise: 1.25 / 0.75 split, the larger share going to the object that
+ has drifted less (or equally) from its entry in ``initial_centers``.
+
+ Returns:
+ ``(centers, metadata)`` where ``metadata`` holds ``collision_iterations``,
+ ``collision_pushes``, ``collision_remaining`` and ``collision_removal``.
+ """
+ # Store every relation edge in both orientations so the lookup below does
+ # not depend on which object the overlap list names first.
+ relation_pairs = set(left_of_edges + front_of_edges)
+ relation_pairs.update((b, a) for a, b in left_of_edges + front_of_edges)
+ fixed_ids = set(fixed_object_ids or set())
+ # Private float64 copies of the incoming centers.
+ current = {
+ oid: np.asarray(center, dtype=np.float64).copy()
+ for oid, center in centers.items()
+ }
+ max_rounds = 80
+ total_pushes = 0
+ last_overlap_count = 0
+
+ for iteration in range(max_rounds):
+ overlaps = _mesh_aabb_collision_pairs(
+ object_ids=object_ids,
+ xy_sizes=xy_sizes,
+ centers=current,
+ padding=padding,
+ )
+ last_overlap_count = len(overlaps)
+ # No overlap left: report how many rounds actually pushed something.
+ if not overlaps:
+ return current, {
+ "collision_iterations": iteration,
+ "collision_pushes": total_pushes,
+ "collision_remaining": 0,
+ "collision_removal": "iterative_mesh_aabb_push",
+ }
+ for item in overlaps:
+ object_a = item["object"]
+ object_b = item["other"]
+ axis = int(item["axis"])
+ # sign is the direction that moves A away from B on this axis
+ # (ties count as "A is on the low side").
+ sign = -1.0 if current[object_a][axis] <= current[object_b][axis] else 1.0
+ # Half of the overlap plus a small epsilon; applied to both objects
+ # it clears the overlap with a little slack.
+ amount = 0.5 * (float(item["overlap"]) + 1.0e-6)
+ a_fixed = object_a in fixed_ids
+ b_fixed = object_b in fixed_ids
+ # Two fixed objects cannot be separated; leave the pair untouched.
+ if a_fixed and b_fixed:
+ continue
+ if (object_a, object_b) in relation_pairs:
+ # Related pair: the movable side takes everything when the other is
+ # fixed, otherwise both move by the same amount.
+ if a_fixed:
+ current[object_b][axis] -= sign * amount * 2.0
+ elif b_fixed:
+ current[object_a][axis] += sign * amount * 2.0
+ else:
+ current[object_a][axis] += sign * amount
+ current[object_b][axis] -= sign * amount
+ elif a_fixed:
+ current[object_b][axis] -= sign * amount * 2.0
+ elif b_fixed:
+ current[object_a][axis] += sign * amount * 2.0
+ else:
+ # Unrelated, both movable: weight the push by how far each object
+ # has already drifted from its initial center.
+ drift_a = np.linalg.norm(
+ current[object_a] - initial_centers[object_a]
+ )
+ drift_b = np.linalg.norm(
+ current[object_b] - initial_centers[object_b]
+ )
+ if drift_a <= drift_b:
+ current[object_a][axis] += sign * amount * 1.25
+ current[object_b][axis] -= sign * amount * 0.75
+ else:
+ current[object_a][axis] += sign * amount * 0.75
+ current[object_b][axis] -= sign * amount * 1.25
+ # NOTE(review): indentation is not visible in this view. Presumably the
+ # counter below runs once per handled overlap and the re-centering once
+ # per round -- confirm against the real file.
+ total_pushes += 1
+ current = _center_xy_aabb_layout(centers=current, xy_sizes=xy_sizes)
+
+ # Round budget exhausted. The remaining count is the number of overlaps seen
+ # at the start of the last round, i.e. before that round's pushes.
+ return current, {
+ "collision_iterations": max_rounds,
+ "collision_pushes": total_pushes,
+ "collision_remaining": last_overlap_count,
+ "collision_removal": "iterative_mesh_aabb_push",
+ }
+
+
+ def _mesh_aabb_collision_pairs(
+     *,
+     object_ids: list[str],
+     xy_sizes: dict[str, np.ndarray],
+     centers: dict[str, np.ndarray],
+     padding: float,
+ ) -> list[dict[str, Any]]:
+     """List every overlapping pair of padded XY footprints, worst first.
+
+     Each unordered pair of ``object_ids`` is tested once with
+     ``_xy_aabb_overlap``. A pair for which that helper returns ``None`` is
+     omitted. For the others, the axis with the smaller overlap is chosen
+     (x on a tie) and recorded together with both per-axis overlaps.
+
+     Returns:
+         Records with keys ``object``, ``other``, ``axis``, ``overlap``,
+         ``overlap_x`` and ``overlap_y``, sorted by ``overlap`` descending.
+     """
+     found: list[dict[str, Any]] = []
+     total = len(object_ids)
+     for first in range(total):
+         id_a = object_ids[first]
+         for second in range(first + 1, total):
+             id_b = object_ids[second]
+             extents = _xy_aabb_overlap(
+                 center_a=centers[id_a],
+                 size_a=xy_sizes[id_a],
+                 center_b=centers[id_b],
+                 size_b=xy_sizes[id_b],
+                 padding=padding,
+             )
+             if extents is None:
+                 continue
+             # Separate along the axis that needs the smaller correction.
+             if extents[0] <= extents[1]:
+                 axis = 0
+             else:
+                 axis = 1
+             record = {
+                 "object": id_a,
+                 "other": id_b,
+                 "axis": axis,
+                 "overlap": float(extents[axis]),
+                 "overlap_x": float(extents[0]),
+                 "overlap_y": float(extents[1]),
+             }
+             found.append(record)
+     return sorted(found, key=lambda record: record["overlap"], reverse=True)
+
+
+# Pack and unpack center coordinates.
+
+
+ def _pack_centers(
+     object_ids: list[str],
+     centers: dict[str, np.ndarray],
+ ) -> np.ndarray:
+     """Flatten per-object XY centers into one ``[x0, y0, x1, y1, ...]`` vector.
+
+     Objects are written in ``object_ids`` order. Only elements 0 and 1 of
+     each center are used; entries of ``centers`` that are not listed in
+     ``object_ids`` are ignored.
+
+     Returns:
+         A float64 array of length ``2 * len(object_ids)``.
+     """
+     packed = np.empty(2 * len(object_ids), dtype=np.float64)
+     for slot, oid in enumerate(object_ids):
+         xy = np.asarray(centers[oid], dtype=np.float64)
+         packed[2 * slot] = float(xy[0])
+         packed[2 * slot + 1] = float(xy[1])
+     return packed
+
+
+ def _unpack_centers(
+     object_ids: list[str],
+     xvec: np.ndarray,
+ ) -> dict[str, np.ndarray]:
+     """Split a packed ``[x0, y0, x1, y1, ...]`` vector into per-object centers.
+
+     The i-th id in ``object_ids`` receives elements ``2 * i`` and
+     ``2 * i + 1`` of ``xvec`` as a new two-element float64 array.
+
+     Returns:
+         Mapping from object id to its ``[x, y]`` center.
+     """
+     unpacked: dict[str, np.ndarray] = {}
+     for slot, oid in enumerate(object_ids):
+         x_pos = 2 * slot
+         unpacked[oid] = np.asarray(
+             [xvec[x_pos], xvec[x_pos + 1]],
+             dtype=np.float64,
+         )
+     return unpacked
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/__init__.py
new file mode 100644
index 000000000..21cf6c253
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/__init__.py
@@ -0,0 +1,43 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager.manager import (
+ MatplotlibManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager.schemas import (
+ RenderFootprintLayoutRequest,
+ RenderFootprintLayoutResult,
+ RenderImageComparisonRequest,
+ RenderImageComparisonResult,
+ RenderSupportRegionRequest,
+ RenderSupportRegionResult,
+ RenderXYComparisonRequest,
+ RenderXYComparisonResult,
+)
+
+__all__ = [
+ "MatplotlibManager",
+ "RenderFootprintLayoutRequest",
+ "RenderFootprintLayoutResult",
+ "RenderImageComparisonRequest",
+ "RenderImageComparisonResult",
+ "RenderSupportRegionRequest",
+ "RenderSupportRegionResult",
+ "RenderXYComparisonRequest",
+ "RenderXYComparisonResult",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/manager.py
new file mode 100644
index 000000000..4b5c58721
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/manager.py
@@ -0,0 +1,416 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Matplotlib manager for mesh visualization."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+from matplotlib.collections import PolyCollection
+from matplotlib.patches import Rectangle
+from mpl_toolkits.mplot3d.art3d import Poly3DCollection
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager.schemas import (
+ RenderFootprintLayoutRequest,
+ RenderFootprintLayoutResult,
+ RenderImageComparisonRequest,
+ RenderImageComparisonResult,
+ RenderSupportRegionRequest,
+ RenderSupportRegionResult,
+ RenderXYComparisonRequest,
+ RenderXYComparisonResult,
+)
+
+__all__ = ["MatplotlibManager"]
+
+
+ class MatplotlibManager:
+     """Manager for mesh visualization via matplotlib.
+
+     Wraps matplotlib rendering with typed request/response methods,
+     following the same pattern as service clients.
+
+     Every ``render_*`` method resolves ``request.output_path``, creates its
+     parent directory and returns a result carrying the resolved path. Each
+     figure is closed in a ``finally`` clause, so a failure while drawing or
+     saving does not leave the figure registered with pyplot.
+     """
+
+     def __init__(
+         self,
+         *,
+         figsize: tuple[float, float] = (8, 8),
+         dpi: int = 180,
+     ) -> None:
+         """Initialize the matplotlib manager.
+
+         Args:
+             figsize: Default figure size for rendered images.
+             dpi: Output image resolution.
+         """
+         self._figsize = figsize
+         self._dpi = dpi
+
+     def render_footprint_layout(
+         self,
+         request: RenderFootprintLayoutRequest,
+     ) -> RenderFootprintLayoutResult:
+         """Render labeled XY footprints with full-length coordinate axes.
+
+         Args:
+             request: Object ids with their XY centers and sizes, the output
+                 path and the plot title.
+
+         Returns:
+             Result holding the resolved output path. When
+             ``request.object_ids`` is empty nothing is drawn and no image
+             file is written.
+         """
+         output_path = request.output_path.expanduser().resolve()
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+         if not request.object_ids:
+             return RenderFootprintLayoutResult(output_path=output_path)
+
+         centers = {
+             object_id: np.asarray(request.centers[object_id], dtype=float)
+             for object_id in request.object_ids
+         }
+         sizes = {
+             object_id: np.asarray(request.xy_sizes[object_id], dtype=float)
+             for object_id in request.object_ids
+         }
+         footprint_mins = np.vstack(
+             [
+                 centers[object_id] - 0.5 * sizes[object_id]
+                 for object_id in request.object_ids
+             ]
+         )
+         footprint_maxs = np.vstack(
+             [
+                 centers[object_id] + 0.5 * sizes[object_id]
+                 for object_id in request.object_ids
+             ]
+         )
+         # Viewport = union of all footprints plus 12 % of the larger span
+         # (never less than 1e-3) on every side.
+         data_min = footprint_mins.min(axis=0)
+         data_max = footprint_maxs.max(axis=0)
+         span = np.maximum(data_max - data_min, 1.0e-6)
+         padding = max(float(span.max()) * 0.12, 1.0e-3)
+         x_limits = (float(data_min[0] - padding), float(data_max[0] + padding))
+         y_limits = (float(data_min[1] - padding), float(data_max[1] + padding))
+
+         fig, ax = plt.subplots(figsize=self._figsize)
+         try:
+             for object_id in request.object_ids:
+                 center = centers[object_id]
+                 size = sizes[object_id]
+                 ax.add_patch(
+                     Rectangle(
+                         (center[0] - 0.5 * size[0], center[1] - 0.5 * size[1]),
+                         size[0],
+                         size[1],
+                         facecolor=(0.35, 0.60, 0.95, 0.30),
+                         edgecolor=(0.08, 0.22, 0.60, 1.0),
+                         linewidth=1.5,
+                     )
+                 )
+                 # Shorter display label: drop "interact_" and a trailing "_0".
+                 label = object_id.replace("interact_", "").removesuffix("_0")
+                 ax.text(
+                     center[0],
+                     center[1],
+                     label,
+                     ha="center",
+                     va="center",
+                     fontsize=9,
+                     color="black",
+                 )
+
+             self._draw_full_xy_axes(ax, x_limits=x_limits, y_limits=y_limits)
+             ax.set_xlim(*x_limits)
+             ax.set_ylim(*y_limits)
+             ax.set_aspect("equal", adjustable="box")
+             ax.set_title(request.title)
+             ax.grid(True, linestyle=":", linewidth=0.6, alpha=0.30)
+             fig.tight_layout()
+             fig.savefig(output_path, dpi=self._dpi)
+         finally:
+             # Close even on failure so the figure is not kept alive by pyplot.
+             plt.close(fig)
+         return RenderFootprintLayoutResult(output_path=output_path)
+
+     def render_image_comparison(
+         self,
+         request: RenderImageComparisonRequest,
+     ) -> RenderImageComparisonResult:
+         """Render two images side by side with numbered labels.
+
+         Both input images are read before the figure is created. The figure
+         uses a fixed 12 x 6 size rather than the manager's default
+         ``figsize``.
+
+         Returns:
+             Result holding the resolved output path.
+         """
+         output_path = request.output_path.expanduser().resolve()
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+         first_image = plt.imread(request.first_image_path.expanduser().resolve())
+         second_image = plt.imread(request.second_image_path.expanduser().resolve())
+
+         fig, axes = plt.subplots(1, 2, figsize=(12, 6))
+         try:
+             for ax, image, label in (
+                 (axes[0], first_image, request.first_label),
+                 (axes[1], second_image, request.second_label),
+             ):
+                 ax.imshow(image)
+                 # Label in the upper-left corner, in axes-relative coordinates.
+                 ax.text(
+                     0.03,
+                     0.92,
+                     label,
+                     transform=ax.transAxes,
+                     ha="left",
+                     va="top",
+                     fontsize=16,
+                     color="white",
+                     bbox={
+                         "boxstyle": "round,pad=0.25",
+                         "facecolor": "black",
+                         "edgecolor": "none",
+                         "alpha": 0.55,
+                     },
+                 )
+                 ax.axis("off")
+             fig.tight_layout()
+             fig.savefig(output_path, dpi=self._dpi, facecolor="white")
+         finally:
+             plt.close(fig)
+         return RenderImageComparisonResult(output_path=output_path)
+
+     @staticmethod
+     def _draw_full_xy_axes(
+         ax: Any,
+         *,
+         x_limits: tuple[float, float],
+         y_limits: tuple[float, float],
+     ) -> None:
+         """Draw axes across the full viewport, centered on the data bounds.
+
+         NOTE(review): the "Origin" marker is drawn at the midpoint of
+         ``x_limits`` / ``y_limits``, not at data coordinate (0, 0). The two
+         coincide only when the plotted layout is centered on zero --
+         confirm that callers guarantee this.
+         """
+         axis_color = "#303030"
+         x_center = 0.5 * (x_limits[0] + x_limits[1])
+         y_center = 0.5 * (y_limits[0] + y_limits[1])
+         # Horizontal axis (X) — spans full width, positioned at vertical centre.
+         ax.annotate(
+             "",
+             xy=(x_limits[1], y_center),
+             xytext=(x_limits[0], y_center),
+             arrowprops={"arrowstyle": "->", "color": axis_color, "lw": 1.8},
+             zorder=8,
+         )
+         # Vertical axis (Y) — spans full height, positioned at horizontal centre.
+         ax.annotate(
+             "",
+             xy=(x_center, y_limits[1]),
+             xytext=(x_center, y_limits[0]),
+             arrowprops={"arrowstyle": "->", "color": axis_color, "lw": 1.8},
+             zorder=8,
+         )
+         x_span = x_limits[1] - x_limits[0]
+         y_span = y_limits[1] - y_limits[0]
+         # Axis names sit just inside the arrow heads.
+         ax.text(
+             x_limits[1] - 0.03 * x_span,
+             y_center + 0.02 * y_span,
+             "+X",
+             ha="right",
+             va="bottom",
+             color=axis_color,
+             fontsize=11,
+         )
+         ax.text(
+             x_center + 0.02 * x_span,
+             y_limits[1] - 0.03 * y_span,
+             "+Y",
+             ha="left",
+             va="top",
+             color=axis_color,
+             fontsize=11,
+         )
+         # Mark the origin at the centre.
+         ax.plot(x_center, y_center, "o", color=axis_color, markersize=6, zorder=9)
+         ax.text(
+             x_center + 0.015 * x_span,
+             y_center + 0.015 * y_span,
+             "Origin",
+             fontsize=8,
+             color=axis_color,
+             ha="left",
+             va="bottom",
+             zorder=9,
+         )
+
+     def render_selected_support_region(
+         self, request: RenderSupportRegionRequest
+     ) -> RenderSupportRegionResult:
+         """Render a mesh with the selected support region highlighted.
+
+         The whole mesh is drawn as a faint grey surface and the faces listed
+         in ``request.face_indices`` are drawn on top in opaque red.
+
+         Returns:
+             Result holding the resolved output path.
+         """
+         output_path = request.output_path.expanduser().resolve()
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+
+         vertices = np.asarray(request.mesh.vertices, dtype=float)
+         faces = np.asarray(request.mesh.faces, dtype=int)
+         selected_faces = faces[np.asarray(request.face_indices, dtype=int)]
+
+         fig = plt.figure(figsize=self._figsize)
+         try:
+             ax = fig.add_subplot(111, projection="3d")
+             ax.add_collection3d(
+                 Poly3DCollection(
+                     vertices[faces],
+                     facecolors=(0.65, 0.68, 0.72, 0.16),
+                     edgecolors=(0.35, 0.37, 0.40, 0.08),
+                     linewidths=0.15,
+                 )
+             )
+             ax.add_collection3d(
+                 Poly3DCollection(
+                     vertices[selected_faces],
+                     facecolors=(1.0, 0.18, 0.05, 0.88),
+                     edgecolors=(0.55, 0.02, 0.0, 1.0),
+                     linewidths=0.8,
+                 )
+             )
+             self._set_equal_axes(ax, vertices)
+             ax.view_init(elev=25.0, azim=-45.0)
+             ax.set_xlabel("X")
+             ax.set_ylabel("Y")
+             ax.set_zlabel("Z")
+             ax.set_title("Selected Support Region")
+             fig.tight_layout()
+             fig.savefig(output_path, dpi=self._dpi)
+         finally:
+             plt.close(fig)
+         return RenderSupportRegionResult(output_path=output_path)
+
+     def render_xy_alignment_comparison(
+         self, request: RenderXYComparisonRequest
+     ) -> RenderXYComparisonResult:
+         """Render before/after XY projections for PCA yaw alignment.
+
+         Both panels share one view center and half-width, computed from the
+         combined vertices of the two meshes, so they are directly comparable.
+
+         Returns:
+             Result holding the resolved output path.
+         """
+         output_path = request.output_path.expanduser().resolve()
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+
+         before_polygons, before_xy = self._xy_polygons_and_vertices(request.before_mesh)
+         after_polygons, after_xy = self._xy_polygons_and_vertices(request.after_mesh)
+         center, view_half = self._xy_view_bounds(before_xy, after_xy)
+
+         fig, axes = plt.subplots(1, 2, figsize=self._figsize)
+         try:
+             self._draw_xy_projection(
+                 axes[0],
+                 before_polygons,
+                 before_xy,
+                 "Before PCA yaw",
+                 center,
+                 view_half,
+             )
+             self._draw_xy_projection(
+                 axes[1],
+                 after_polygons,
+                 after_xy,
+                 f"After PCA yaw ({request.angle_degrees:.2f} deg)",
+                 center,
+                 view_half,
+             )
+             fig.tight_layout()
+             fig.savefig(output_path, dpi=self._dpi)
+         finally:
+             plt.close(fig)
+         return RenderXYComparisonResult(output_path=output_path)
+
+     @staticmethod
+     def _xy_polygons_and_vertices(mesh: Any) -> tuple[Any, Any]:
+         """Return the XY face polygons and the XY vertices of ``mesh``.
+
+         ``mesh`` must expose ``vertices`` and ``faces``. The first returned
+         array holds the x/y coordinates of each face's vertices, the second
+         the x/y coordinates of every vertex.
+         """
+         vertices = np.asarray(mesh.vertices, dtype=float)
+         faces = np.asarray(mesh.faces, dtype=int)
+         return vertices[faces][:, :, :2], vertices[:, :2]
+
+     @staticmethod
+     def _xy_view_bounds(before_xy: Any, after_xy: Any) -> tuple[Any, float]:
+         """Compute a shared square view for two XY point sets.
+
+         Returns:
+             ``(center, view_half)``: the midpoint of the combined bounding
+             box, and a half-width equal to 65 % of the larger span but at
+             least 0.5.
+         """
+         values = np.concatenate([before_xy, after_xy], axis=0)
+         bounds_min = values.min(axis=0)
+         bounds_max = values.max(axis=0)
+         center = 0.5 * (bounds_min + bounds_max)
+         span = np.maximum(bounds_max - bounds_min, 1e-3)
+         view_half = max(float(span.max()) * 0.65, 0.5)
+         return center, view_half
+
+     def _draw_xy_projection(
+         self,
+         ax: Any,
+         polygons_xy: Any,
+         vertices_xy: Any,
+         title: str,
+         center: Any,
+         view_half: float,
+     ) -> None:
+         """Draw one top-down mesh panel: faces, bounding box, axes and grid.
+
+         Args:
+             ax: Target axes.
+             polygons_xy: XY polygons of the mesh faces.
+             vertices_xy: XY coordinates of the mesh vertices (for the box).
+             title: Panel title.
+             center: XY point placed at the middle of the view.
+             view_half: Half-width of the square view.
+         """
+         ax.add_collection(
+             PolyCollection(
+                 polygons_xy,
+                 facecolors=(0.24, 0.50, 0.90, 0.28),
+                 edgecolors=(0.05, 0.16, 0.35, 0.20),
+                 linewidths=0.20,
+             )
+         )
+         self._draw_xy_aabb(ax, vertices_xy)
+         self._add_xy_axes(ax, view_half)
+         ax.set_xlim(center[0] - view_half, center[0] + view_half)
+         ax.set_ylim(center[1] - view_half, center[1] + view_half)
+         ax.set_aspect("equal", adjustable="box")
+         ax.set_xlabel("X")
+         ax.set_ylabel("Y")
+         ax.set_title(title)
+         ax.grid(True, which="major", linestyle="-", linewidth=0.7, alpha=0.35)
+         ax.minorticks_on()
+         ax.grid(True, which="minor", linestyle=":", linewidth=0.45, alpha=0.25)
+
+     @staticmethod
+     def _draw_xy_aabb(ax: Any, vertices_xy: Any) -> None:
+         """Outline the axis-aligned bounding box of ``vertices_xy`` in red."""
+         bounds_min = vertices_xy.min(axis=0)
+         bounds_max = vertices_xy.max(axis=0)
+         width, height = bounds_max - bounds_min
+         ax.add_patch(
+             Rectangle(
+                 (bounds_min[0], bounds_min[1]),
+                 width,
+                 height,
+                 fill=False,
+                 edgecolor="#d62828",
+                 linewidth=1.6,
+                 linestyle="-",
+                 alpha=0.95,
+             )
+         )
+
+     @staticmethod
+     def _add_xy_axes(ax: Any, view_half: float) -> None:
+         """Mark data coordinate (0, 0) and draw short +X / +Y arrows from it.
+
+         The arrow length is 35 % of ``view_half`` but at least 0.2.
+         """
+         arrow_len = max(view_half * 0.35, 0.2)
+         ax.scatter([0.0], [0.0], color="black", s=22, zorder=8)
+         ax.text(0.0, 0.0, " Origin", fontsize=9, ha="left", va="bottom")
+         ax.arrow(
+             0.0,
+             0.0,
+             arrow_len,
+             0.0,
+             width=arrow_len * 0.015,
+             head_width=arrow_len * 0.06,
+             head_length=arrow_len * 0.08,
+             color="#d62828",
+             length_includes_head=True,
+             zorder=9,
+         )
+         ax.text(arrow_len * 1.08, 0.0, "+X", color="#d62828", fontsize=11)
+         ax.arrow(
+             0.0,
+             0.0,
+             0.0,
+             arrow_len,
+             width=arrow_len * 0.015,
+             head_width=arrow_len * 0.06,
+             head_length=arrow_len * 0.08,
+             color="#2a9d8f",
+             length_includes_head=True,
+             zorder=9,
+         )
+         ax.text(0.0, arrow_len * 1.08, "+Y", color="#2a9d8f", fontsize=11)
+
+     @staticmethod
+     def _set_equal_axes(ax: Any, vertices: Any) -> None:
+         """Give a 3D axes the same range on x, y and z around the mesh center.
+
+         The half-range is half of the largest bounding-box extent (at least
+         1e-6), so the mesh is not stretched along any axis.
+         """
+         mins = np.min(vertices, axis=0)
+         maxs = np.max(vertices, axis=0)
+         center = (mins + maxs) * 0.5
+         radius = max(float(np.max(maxs - mins)) * 0.5, 1e-6)
+         ax.set_xlim(center[0] - radius, center[0] + radius)
+         ax.set_ylim(center[1] - radius, center[1] + radius)
+         ax.set_zlim(center[2] - radius, center[2] + radius)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/schemas.py
new file mode 100644
index 000000000..764383f38
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/matplotlib_manager/schemas.py
@@ -0,0 +1,101 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+__all__ = [
+ "RenderFootprintLayoutRequest",
+ "RenderFootprintLayoutResult",
+ "RenderImageComparisonRequest",
+ "RenderImageComparisonResult",
+ "RenderSupportRegionRequest",
+ "RenderSupportRegionResult",
+ "RenderXYComparisonRequest",
+ "RenderXYComparisonResult",
+]
+
+
+ @dataclass(frozen=True)
+ class RenderFootprintLayoutRequest:
+ """Request to render labeled top-down object footprints.
+
+ Consumed by ``MatplotlibManager.render_footprint_layout``, which draws one
+ labeled rectangle per entry of ``object_ids``.
+ """
+
+ # Ids of the objects to draw; each is looked up in ``centers`` and ``xy_sizes``.
+ object_ids: list[str]
+ # Per-object footprint center; element 0 is read as x and element 1 as y.
+ centers: dict[str, Any]
+ # Per-object footprint extent; element 0 is the x size and element 1 the y size.
+ xy_sizes: dict[str, Any]
+ # Destination image file; the manager expands ``~`` and resolves it before saving.
+ output_path: Path
+ # Text used as the plot title.
+ title: str = ""
+
+
+ @dataclass(frozen=True)
+ class RenderFootprintLayoutResult:
+ """Result of rendering a footprint layout."""
+
+ # Resolved path of the image. The manager returns this path without writing
+ # a file when the request lists no object ids.
+ output_path: Path
+
+
+ @dataclass(frozen=True)
+ class RenderImageComparisonRequest:
+ """Request to render two labeled images side by side.
+
+ Consumed by ``MatplotlibManager.render_image_comparison``.
+ """
+
+ # Image shown in the left panel.
+ first_image_path: Path
+ # Image shown in the right panel.
+ second_image_path: Path
+ # Destination image file; the manager expands ``~`` and resolves it before saving.
+ output_path: Path
+ # Caption drawn in the upper-left corner of the left panel.
+ first_label: str = "1: normal"
+ # Caption drawn in the upper-left corner of the right panel.
+ second_label: str = "2: flipped"
+
+
+ @dataclass(frozen=True)
+ class RenderImageComparisonResult:
+ """Result of rendering an image comparison."""
+
+ # Resolved path of the saved side-by-side image.
+ output_path: Path
+
+
+ @dataclass(frozen=True)
+ class RenderSupportRegionRequest:
+ """Request to render a mesh with the selected support region highlighted.
+
+ Consumed by ``MatplotlibManager.render_selected_support_region``.
+ """
+
+ # Mesh object; the manager reads its ``vertices`` and ``faces`` attributes.
+ mesh: Any
+ # Indices into ``mesh.faces`` of the faces to highlight.
+ face_indices: list[int]
+ # Destination image file; the manager expands ``~`` and resolves it before saving.
+ output_path: Path
+
+
+ @dataclass(frozen=True)
+ class RenderSupportRegionResult:
+ """Result of rendering the support region."""
+
+ # Resolved path of the saved support-region image.
+ output_path: Path
+
+
+ @dataclass(frozen=True)
+ class RenderXYComparisonRequest:
+ """Request to render before/after XY projections for PCA yaw alignment.
+
+ Consumed by ``MatplotlibManager.render_xy_alignment_comparison``.
+ """
+
+ # Mesh drawn in the "Before PCA yaw" panel; needs ``vertices`` and ``faces``.
+ before_mesh: Any
+ # Mesh drawn in the "After PCA yaw" panel; needs ``vertices`` and ``faces``.
+ after_mesh: Any
+ # Angle printed, with two decimals, in the title of the "after" panel.
+ angle_degrees: float
+ # Destination image file; the manager expands ``~`` and resolves it before saving.
+ output_path: Path
+
+
+ @dataclass(frozen=True)
+ class RenderXYComparisonResult:
+ """Result of rendering the XY alignment comparison."""
+
+ # Resolved path of the saved before/after comparison image.
+ output_path: Path
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/__init__.py
new file mode 100644
index 000000000..b85c8749f
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/__init__.py
@@ -0,0 +1,56 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager.manager import (
+ METRIC_SCALE_ENABLED,
+ SimreadyManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager.schemas import (
+ EstimateMetricScalesRequest,
+ EstimateMetricScalesResult,
+ GlobalMetricScaleRequest,
+ MakeAssetSimreadyRequest,
+ MakeAssetSimreadyResult,
+ MakeTableSimreadyRequest,
+ MakeTableSimreadyResult,
+ MetricScaleObjectInput,
+)
+from embodichain.gen_sim.prompt2scene.prompts.builders import (
+ build_image_metric_scale_messages,
+)
+from embodichain.gen_sim.prompt2scene.prompts.schemas import (
+ IMAGE_METRIC_SCALE_JSON_SCHEMA,
+)
+
+MetricScaleManager = SimreadyManager
+
+__all__ = [
+ "EstimateMetricScalesRequest",
+ "EstimateMetricScalesResult",
+ "GlobalMetricScaleRequest",
+ "IMAGE_METRIC_SCALE_JSON_SCHEMA",
+ "MakeAssetSimreadyRequest",
+ "MakeAssetSimreadyResult",
+ "MakeTableSimreadyRequest",
+ "MakeTableSimreadyResult",
+ "METRIC_SCALE_ENABLED",
+ "MetricScaleManager",
+ "MetricScaleObjectInput",
+ "SimreadyManager",
+ "build_image_metric_scale_messages",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/manager.py
new file mode 100644
index 000000000..cc9ab3d15
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/manager.py
@@ -0,0 +1,649 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.manager import (
+ DEFAULT_INPUT_UP_AXIS,
+ DEFAULT_UP_AXIS,
+ GeometryManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager.schemas import (
+ AlignToAxisRequest,
+ CenterMeshRequest,
+ ConvertUpAxisRequest,
+ DetectTabletopRequest,
+ ExportMeshRequest,
+ LoadMeshRequest,
+ NormalizeRequest,
+ PlaceAbovePlaneRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager.manager import (
+ MatplotlibManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager.schemas import (
+ RenderSupportRegionRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager.schemas import (
+ MakeAssetSimreadyRequest,
+ MakeAssetSimreadyResult,
+ MakeTableSimreadyRequest,
+ MakeTableSimreadyResult,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.manager import (
+ SimulationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.schemas import (
+ GravityDropRequest,
+)
+
+
+# Module-level switch for metric scaling. It is read by
+# SimreadyManager.compute_global_from_object_scenes, which returns a
+# "disabled" report with scale_factor 1.0 when this is False.
+METRIC_SCALE_ENABLED = True
+
+from .utils import (
+ _as_transform,
+ _axis_angle_rotation,
+ _axis_conversion_transform,
+ _center_aabb_bottom_xy_at_origin,
+ _center_aabb_bottom_xy_at_origin_transform,
+ _normalize,
+ _orthogonal_axis,
+ _place_above_plane_transform,
+ _request_axis,
+ _rotation_between_vectors,
+ _scale_transform,
+ _translation_transform,
+)
+
+class SimreadyManager:
+ """Prepare generated GLB assets for simulation placement."""
+
+    def __init__(
+        self,
+        *,
+        geometry_manager: GeometryManager | None = None,
+        simulation_manager: SimulationManager | None = None,
+        matplotlib_manager: MatplotlibManager | None = None,
+    ) -> None:
+        """Store the collaborating managers, building defaults where needed.
+
+        Args:
+            geometry_manager: Manager used for mesh loading and transforms. A
+                falsy value selects a freshly constructed ``GeometryManager``.
+            simulation_manager: Manager used for the gravity-drop simulation.
+                A falsy value selects a fresh ``SimulationManager``.
+            matplotlib_manager: Manager used to render the support region. A
+                falsy value selects a fresh ``MatplotlibManager``.
+        """
+        if not geometry_manager:
+            geometry_manager = GeometryManager()
+        self.geometry_manager = geometry_manager
+
+        if not simulation_manager:
+            simulation_manager = SimulationManager()
+        self.simulation_manager = simulation_manager
+
+        if not matplotlib_manager:
+            matplotlib_manager = MatplotlibManager()
+        self.matplotlib_manager = matplotlib_manager
+
+    def make_asset_simready(
+        self,
+        request: MakeAssetSimreadyRequest,
+    ) -> MakeAssetSimreadyResult:
+        """Prepare a single asset mesh and export it as a sim-ready GLB.
+
+        Steps, in order: load the mesh, convert it to ``DEFAULT_UP_AXIS``,
+        centre it by its bounding box, lift it above the ground plane, run a
+        gravity-drop simulation on a temporary GLB, apply the resulting pose,
+        re-centre the AABB on the XY origin, normalise, lift again, convert
+        back to ``DEFAULT_INPUT_UP_AXIS`` and export. The 4x4 matrix of every
+        step is left-multiplied into one accumulated transform.
+
+        Args:
+            request: Paths and options. ``output_path`` must end in ``.glb``;
+                ``input_up_axis`` falls back to ``DEFAULT_INPUT_UP_AXIS`` when
+                ``None``; ``ground_clearance`` is used by both lift steps.
+
+        Returns:
+            The resolved output path and the accumulated transform as nested
+            lists.
+
+        Raises:
+            ValueError: If ``output_path`` does not have a ``.glb`` suffix.
+        """
+        input_path = request.input_path.expanduser().resolve()
+        output_path = request.output_path.expanduser().resolve()
+        if output_path.suffix.lower() != ".glb":
+            raise ValueError("Sim-ready asset output_path must be a .glb file.")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        input_up_axis = _request_axis(request.input_up_axis, DEFAULT_INPUT_UP_AXIS)
+        raw_to_simready = np.eye(4, dtype=np.float64)
+        geom = self.geometry_manager
+        sim = self.simulation_manager
+
+        mesh = geom.load_mesh(LoadMeshRequest(mesh_path=input_path)).mesh
+
+        # Input up axis -> working up axis.
+        transform = _axis_conversion_transform(input_up_axis, DEFAULT_UP_AXIS)
+        raw_to_simready = transform @ raw_to_simready
+        mesh = geom.convert_up_axis(
+            ConvertUpAxisRequest(
+                mesh=mesh,
+                input_up_axis=input_up_axis,
+                output_up_axis=DEFAULT_UP_AXIS,
+            )
+        ).mesh
+
+        # Move the bounding-box centre to the origin.
+        center_result = geom.center_by_bbox(CenterMeshRequest(mesh=mesh))
+        mesh = center_result.mesh
+        transform = _translation_transform(-np.asarray(center_result.bbox_center))
+        raw_to_simready = transform @ raw_to_simready
+
+        # The lift matrix must be derived from the mesh *before* it is moved.
+        transform = _place_above_plane_transform(mesh, request.ground_clearance)
+        raw_to_simready = transform @ raw_to_simready
+        mesh = geom.place_above_plane(
+            PlaceAbovePlaneRequest(mesh=mesh, clearance=request.ground_clearance)
+        ).mesh
+
+        # The simulator is fed a copy converted back to DEFAULT_INPUT_UP_AXIS;
+        # this copy does not contribute to the accumulated transform.
+        pre_gravity_mesh = geom.convert_up_axis(
+            ConvertUpAxisRequest(
+                mesh=mesh,
+                input_up_axis=DEFAULT_UP_AXIS,
+                output_up_axis=DEFAULT_INPUT_UP_AXIS,
+            )
+        ).mesh
+        pre_gravity_path = output_path.with_name(f".{output_path.stem}_pre_gravity.glb")
+        try:
+            # Exported inside the try so that a failed or partial write of the
+            # temporary file is still removed by the finally clause.
+            geom.export_mesh(
+                ExportMeshRequest(mesh=pre_gravity_mesh, output_path=pre_gravity_path)
+            )
+            gravity_result = sim.run_gravity_simulation(
+                GravityDropRequest(glb_path=pre_gravity_path, max_convex_hull_num=16)
+            )
+
+            # Apply the settled pose to the working-frame mesh.
+            gravity_transform = _as_transform(gravity_result.final_pose)
+            settled_mesh = mesh.copy()
+            settled_mesh.apply_transform(gravity_transform)
+            raw_to_simready = gravity_transform @ raw_to_simready
+
+            # Re-centre the AABB on the XY origin. One pass suffices: a second
+            # pass on the already-centred mesh would be a zero translation.
+            transform = _center_aabb_bottom_xy_at_origin_transform(settled_mesh)
+            settled_mesh.apply_transform(transform)
+            raw_to_simready = transform @ raw_to_simready
+
+            normalize_result = geom.normalize(NormalizeRequest(mesh=settled_mesh))
+            final_mesh = normalize_result.mesh
+            transform = _scale_transform(normalize_result.scale_factor)
+            raw_to_simready = transform @ raw_to_simready
+
+            transform = _place_above_plane_transform(final_mesh, request.ground_clearance)
+            raw_to_simready = transform @ raw_to_simready
+            final_mesh = geom.place_above_plane(
+                PlaceAbovePlaneRequest(
+                    mesh=final_mesh,
+                    clearance=request.ground_clearance,
+                )
+            ).mesh
+
+            # Working up axis -> DEFAULT_INPUT_UP_AXIS for the exported file.
+            transform = _axis_conversion_transform(DEFAULT_UP_AXIS, DEFAULT_INPUT_UP_AXIS)
+            raw_to_simready = transform @ raw_to_simready
+            final_mesh = geom.convert_up_axis(
+                ConvertUpAxisRequest(
+                    mesh=final_mesh,
+                    input_up_axis=DEFAULT_UP_AXIS,
+                    output_up_axis=DEFAULT_INPUT_UP_AXIS,
+                )
+            ).mesh
+
+            geom.export_mesh(ExportMeshRequest(mesh=final_mesh, output_path=output_path))
+        finally:
+            pre_gravity_path.unlink(missing_ok=True)
+
+        return MakeAssetSimreadyResult(
+            output_path=output_path,
+            transform_matrix=raw_to_simready.tolist(),
+        )
+
+    def make_table_simready(
+        self,
+        request: MakeTableSimreadyRequest,
+    ) -> MakeTableSimreadyResult:
+        """Prepare a table mesh and export it as a sim-ready GLB.
+
+        Same pipeline as ``make_asset_simready`` with two table-specific
+        additions: after centring, the detected tabletop normal is aligned to
+        the requested up axis, and after the gravity drop the selected support
+        region of the settled mesh is rendered to
+        ``<output stem>_support_region.png`` next to the output file. The 4x4
+        matrix of every step is left-multiplied into one accumulated transform.
+
+        Args:
+            request: Paths and options. ``output_path`` must end in ``.glb``;
+                ``input_up_axis`` / ``up_axis`` fall back to
+                ``DEFAULT_INPUT_UP_AXIS`` / ``DEFAULT_UP_AXIS`` when ``None``;
+                ``ground_clearance`` is used by both lift steps.
+
+        Returns:
+            The resolved output path and the accumulated transform as nested
+            lists.
+
+        Raises:
+            ValueError: If ``output_path`` does not have a ``.glb`` suffix.
+        """
+        input_path = request.input_path.expanduser().resolve()
+        output_path = request.output_path.expanduser().resolve()
+        if output_path.suffix.lower() != ".glb":
+            raise ValueError("Sim-ready table output_path must be a .glb file.")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        input_up_axis = _request_axis(request.input_up_axis, DEFAULT_INPUT_UP_AXIS)
+        up_axis = _request_axis(request.up_axis, DEFAULT_UP_AXIS)
+        raw_to_simready = np.eye(4, dtype=np.float64)
+        geom = self.geometry_manager
+        sim = self.simulation_manager
+        mpl = self.matplotlib_manager
+
+        mesh = geom.load_mesh(LoadMeshRequest(mesh_path=input_path)).mesh
+
+        # Input up axis -> working up axis.
+        transform = _axis_conversion_transform(input_up_axis, DEFAULT_UP_AXIS)
+        raw_to_simready = transform @ raw_to_simready
+        mesh = geom.convert_up_axis(
+            ConvertUpAxisRequest(
+                mesh=mesh,
+                input_up_axis=input_up_axis,
+                output_up_axis=DEFAULT_UP_AXIS,
+            )
+        ).mesh
+
+        # Move the bounding-box centre to the origin.
+        center_result = geom.center_by_bbox(CenterMeshRequest(mesh=mesh))
+        mesh = center_result.mesh
+        transform = _translation_transform(-np.asarray(center_result.bbox_center))
+        raw_to_simready = transform @ raw_to_simready
+
+        # Rotate the detected tabletop normal onto the requested up axis.
+        detect_result = geom.detect_tabletop(DetectTabletopRequest(mesh=mesh))
+        transform = _axis_conversion_transform(detect_result.oriented_normal, up_axis)
+        raw_to_simready = transform @ raw_to_simready
+        mesh = geom.align_to_axis(
+            AlignToAxisRequest(
+                mesh=mesh,
+                source_axis=detect_result.oriented_normal,
+                target_axis=up_axis,
+            )
+        ).mesh
+
+        # The lift matrix must be derived from the mesh *before* it is moved.
+        transform = _place_above_plane_transform(mesh, request.ground_clearance)
+        raw_to_simready = transform @ raw_to_simready
+        mesh = geom.place_above_plane(
+            PlaceAbovePlaneRequest(mesh=mesh, clearance=request.ground_clearance)
+        ).mesh
+
+        # The simulator is fed a copy converted back to DEFAULT_INPUT_UP_AXIS;
+        # this copy does not contribute to the accumulated transform.
+        pre_gravity_mesh = geom.convert_up_axis(
+            ConvertUpAxisRequest(
+                mesh=mesh,
+                input_up_axis=DEFAULT_UP_AXIS,
+                output_up_axis=DEFAULT_INPUT_UP_AXIS,
+            )
+        ).mesh
+        pre_gravity_path = output_path.with_name(f".{output_path.stem}_pre_gravity.glb")
+        try:
+            # Exported inside the try so that a failed or partial write of the
+            # temporary file is still removed by the finally clause.
+            geom.export_mesh(
+                ExportMeshRequest(mesh=pre_gravity_mesh, output_path=pre_gravity_path)
+            )
+            gravity_result = sim.run_gravity_simulation(
+                GravityDropRequest(glb_path=pre_gravity_path, max_convex_hull_num=8)
+            )
+
+            # Apply the settled pose to the working-frame mesh.
+            gravity_transform = _as_transform(gravity_result.final_pose)
+            settled_mesh = mesh.copy()
+            settled_mesh.apply_transform(gravity_transform)
+            raw_to_simready = gravity_transform @ raw_to_simready
+
+            # Re-centre the AABB on the XY origin. One pass suffices: a second
+            # pass on the already-centred mesh would be a zero translation.
+            transform = _center_aabb_bottom_xy_at_origin_transform(settled_mesh)
+            settled_mesh.apply_transform(transform)
+            raw_to_simready = transform @ raw_to_simready
+
+            # Re-detect the tabletop on the settled mesh and render it.
+            settled_detect = geom.detect_tabletop(
+                DetectTabletopRequest(mesh=settled_mesh)
+            )
+            mpl.render_selected_support_region(
+                RenderSupportRegionRequest(
+                    mesh=settled_mesh,
+                    face_indices=settled_detect.selected.face_indices,
+                    output_path=output_path.with_name(
+                        f"{output_path.stem}_support_region.png"
+                    ),
+                )
+            )
+
+            normalize_result = geom.normalize(NormalizeRequest(mesh=settled_mesh))
+            final_mesh = normalize_result.mesh
+            transform = _scale_transform(normalize_result.scale_factor)
+            raw_to_simready = transform @ raw_to_simready
+
+            transform = _place_above_plane_transform(final_mesh, request.ground_clearance)
+            raw_to_simready = transform @ raw_to_simready
+            final_mesh = geom.place_above_plane(
+                PlaceAbovePlaneRequest(
+                    mesh=final_mesh,
+                    clearance=request.ground_clearance,
+                )
+            ).mesh
+
+            # Working up axis -> DEFAULT_INPUT_UP_AXIS for the exported file.
+            transform = _axis_conversion_transform(DEFAULT_UP_AXIS, DEFAULT_INPUT_UP_AXIS)
+            raw_to_simready = transform @ raw_to_simready
+            final_mesh = geom.convert_up_axis(
+                ConvertUpAxisRequest(
+                    mesh=final_mesh,
+                    input_up_axis=DEFAULT_UP_AXIS,
+                    output_up_axis=DEFAULT_INPUT_UP_AXIS,
+                )
+            ).mesh
+
+            geom.export_mesh(ExportMeshRequest(mesh=final_mesh, output_path=output_path))
+        finally:
+            pre_gravity_path.unlink(missing_ok=True)
+
+        return MakeTableSimreadyResult(
+            output_path=output_path,
+            transform_matrix=raw_to_simready.tolist(),
+        )
+    @staticmethod
+    def estimate_metric_scales(request):
+        """Run the structured-JSON model step and derive per-object scales.
+
+        Builds the geometric payload for ``request.objects``, calls
+        ``call_structured_json_model_step`` with the request's llm, schema,
+        messages and context, and converts the model output into one scale
+        entry per object via ``apply_model_output``.
+
+        Args:
+            request: Object exposing ``objects``, ``llm``, ``schema``,
+                ``messages``, ``context``, ``method`` and ``raw_output_path``.
+                When ``raw_output_path`` is not ``None`` a writer that dumps
+                the raw payload to that (resolved) path is handed to the
+                model step.
+
+        Returns:
+            An ``EstimateMetricScalesResult`` with status ``"ok"``.
+        """
+        from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager.schemas import (
+            EstimateMetricScalesRequest,
+            EstimateMetricScalesResult,
+        )
+        from embodichain.gen_sim.prompt2scene.llms.llm_output import (
+            call_structured_json_model_step,
+        )
+        from embodichain.gen_sim.prompt2scene.utils.io import write_json
+
+        payload_for_objects = SimreadyManager.build_object_payload(request.objects)
+
+        raw_dump_path = None
+        if request.raw_output_path is not None:
+            raw_dump_path = request.raw_output_path.expanduser().resolve()
+
+        def _dump_raw(payload):
+            return write_json(raw_dump_path, payload)
+
+        model_output = call_structured_json_model_step(
+            llm=request.llm,
+            schema=request.schema,
+            messages=request.messages,
+            context=request.context,
+            attempt_count=0,
+            raw_output_writer=_dump_raw if raw_dump_path is not None else None,
+        )
+        scales = SimreadyManager.apply_model_output(
+            object_payload=payload_for_objects,
+            raw_model_output=model_output,
+            method=request.method,
+        )
+        return EstimateMetricScalesResult(
+            status="ok",
+            object_scales=scales,
+            object_payload=payload_for_objects,
+            raw_model_output=model_output,
+        )
+
+
+    @staticmethod
+    def build_object_payload(objects):
+        """Describe each object's mesh geometry for the metric-scale prompt.
+
+        Every object's mesh is loaded from ``mesh_path`` with a fresh
+        ``GeometryManager`` and summarised by its metric bbox size and the
+        corresponding bbox ratio.
+
+        Args:
+            objects: Iterable of items exposing ``object_id``, ``object_name``,
+                ``object_description`` and ``mesh_path``.
+
+        Returns:
+            One dict per object, in input order.
+        """
+        from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+            GeometryManager,
+            LoadMeshRequest,
+        )
+        from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager.schemas import (
+            MetricScaleObjectInput,
+        )
+
+        geometry = GeometryManager()
+
+        def _describe(entry):
+            loaded = geometry.load_mesh(LoadMeshRequest(mesh_path=entry.mesh_path)).mesh
+            size_m = GeometryManager.mesh_metric_bbox_size(loaded)
+            return {
+                "object_id": entry.object_id,
+                "object_name": entry.object_name,
+                "object_description": entry.object_description,
+                "normalized_bbox_method": "pca_bbox",
+                "normalized_bbox_size_m": size_m.tolist(),
+                "normalized_bbox_ratio": GeometryManager.bbox_ratio(size_m).tolist(),
+            }
+
+        return [_describe(entry) for entry in objects]
+
+
+    @staticmethod
+    def object_prompt_payload(objects):
+        """Return the id/name/description triple of every object as a dict.
+
+        Args:
+            objects: Iterable of items exposing ``object_id``, ``object_name``
+                and ``object_description``.
+
+        Returns:
+            A list of dicts with exactly those three keys, in input order.
+        """
+        summaries = []
+        for entry in objects:
+            summaries.append(
+                dict(
+                    object_id=entry.object_id,
+                    object_name=entry.object_name,
+                    object_description=entry.object_description,
+                )
+            )
+        return summaries
+
+
+    @staticmethod
+    def apply_model_output(*, object_payload, raw_model_output, method):
+        """Match model-estimated dimensions to objects and build scale entries.
+
+        The model output is untrusted, so malformed parts degrade to
+        per-object failures instead of aborting the whole batch: a missing or
+        non-dict ``raw_model_output``, or an ``"object_scales"`` value that is
+        not a list, is treated as "no estimates"; a ``confidence`` that cannot
+        be converted to ``float`` is recorded as ``0.0``.
+
+        Args:
+            object_payload: Dicts as produced by ``build_object_payload``;
+                each must contain ``"normalized_bbox_size_m"``.
+            raw_model_output: Decoded model response, expected to hold an
+                ``"object_scales"`` list of dicts keyed by ``"object_id"``.
+            method: Label copied into every returned entry.
+
+        Returns:
+            One entry per payload item, in payload order: the result of
+            ``select_candidate`` when the model supplied an item for that id,
+            otherwise a ``failure`` entry with reason
+            ``"missing_object_scale_from_model"``.
+        """
+        import numpy as np
+
+        raw_items = (
+            raw_model_output.get("object_scales")
+            if isinstance(raw_model_output, dict)
+            else None
+        )
+        if not isinstance(raw_items, (list, tuple)):
+            raw_items = ()
+        model_by_id = {
+            str(item.get("object_id", "")): item
+            for item in raw_items
+            if isinstance(item, dict)
+        }
+
+        estimates = []
+        for p in object_payload:
+            oid = str(p.get("object_id", ""))
+            model_item = model_by_id.get(oid)
+            if model_item is None:
+                estimates.append(
+                    SimreadyManager.failure(
+                        object_id=oid,
+                        reason="missing_object_scale_from_model",
+                        method=method,
+                    )
+                )
+                continue
+
+            try:
+                confidence = float(model_item.get("confidence", 0.0))
+            except (TypeError, ValueError):
+                # A null or non-numeric confidence does not invalidate the
+                # dimensions; fall back to the same default as a missing key.
+                confidence = 0.0
+
+            estimates.append(
+                SimreadyManager.select_candidate(
+                    object_id=oid,
+                    object_name=str(p.get("object_name", "")),
+                    object_description=str(p.get("object_description", "")),
+                    bbox_dims_cm=model_item.get("bbox_dims_cm", []),
+                    confidence=confidence,
+                    reason=str(model_item.get("reason", "")),
+                    normalized_bbox_size_m=np.asarray(
+                        p["normalized_bbox_size_m"], dtype=np.float64
+                    ),
+                    method=method,
+                )
+            )
+        return estimates
+
+
+    @staticmethod
+    def apply_to_objects(*, objects, object_scales):
+        """Attach each scale entry to the object record with the same id.
+
+        Scale entries are keyed by ``"object_id"`` and object records by
+        ``"id"``; both are compared as strings. Records without a matching
+        entry are left untouched. Mutates ``objects`` in place.
+        """
+        lookup = {}
+        for entry in object_scales:
+            lookup[str(entry.get("object_id", ""))] = entry
+
+        for record in objects:
+            key = str(record.get("id", ""))
+            if key not in lookup:
+                continue
+            record["metric_scale"] = lookup[key]
+
+
+    @staticmethod
+    def select_candidate(
+        *,
+        object_id,
+        object_name,
+        object_description,
+        bbox_dims_cm,
+        confidence,
+        reason,
+        normalized_bbox_size_m,
+        method,
+    ):
+        """Build the full scale entry for one object, or a failure entry.
+
+        Delegates the numeric work to ``compute_from_bbox_dims``. A
+        ``TypeError`` or ``ValueError`` raised there yields a ``failure``
+        entry with reason ``"invalid_bbox_dims_cm"``.
+
+        Returns:
+            A dict with status ``"ok"`` describing the object, its bbox sizes
+            in metres and centimetres, the axis match and the scale factor, or
+            the failure dict described above.
+        """
+        import numpy as np
+        from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+            GeometryManager,
+        )
+
+        try:
+            chosen = SimreadyManager.compute_from_bbox_dims(
+                bbox_dims_cm=bbox_dims_cm,
+                confidence=confidence,
+                reason=reason,
+                normalized_bbox_size_m=normalized_bbox_size_m,
+            )
+        except (TypeError, ValueError):
+            return SimreadyManager.failure(
+                object_id=object_id,
+                reason="invalid_bbox_dims_cm",
+                method=method,
+            )
+
+        size_cm = np.asarray(normalized_bbox_size_m, dtype=np.float64) * 100.0
+        size_m_list = normalized_bbox_size_m.tolist()
+        size_cm_list = size_cm.tolist()
+        ratio_list = GeometryManager.bbox_ratio(normalized_bbox_size_m).tolist()
+
+        entry = {
+            "status": "ok",
+            "method": method,
+            "object_id": object_id,
+            "object_name": object_name,
+            "object_description": object_description,
+            "normalized_bbox_method": "pca_bbox",
+            "normalized_bbox_size_m": size_m_list,
+            "normalized_bbox_size_cm": size_cm_list,
+            "normalized_bbox_ratio": ratio_list,
+        }
+        for key in ("bbox_dims_cm", "axis_match", "scale_factor", "confidence", "reason"):
+            entry[key] = chosen[key]
+        entry["unit_note"] = "scale_factor is not baked into this GLB."
+        return entry
+
+
+    @staticmethod
+    def compute_from_bbox_dims(*, bbox_dims_cm, confidence, reason, normalized_bbox_size_m):
+        """Derive a scale factor from model-estimated bbox dimensions.
+
+        Args:
+            bbox_dims_cm: Three positive, finite dimensions in centimetres.
+            confidence: Passed through unchanged into the result.
+            reason: Passed through unchanged into the result.
+            normalized_bbox_size_m: Current bbox size in metres; it is
+                multiplied by 100 before matching.
+
+        Returns:
+            Dict with the validated ``bbox_dims_cm``, the ``axis_match``
+            returned by ``GeometryManager.best_axis_bbox_scale_match``, its
+            ``scale_factor`` as ``float``, and the passed-through
+            ``confidence`` / ``reason``.
+
+        Raises:
+            ValueError: If ``bbox_dims_cm`` is not three positive finite
+                numbers (also raised by ``float`` for unparsable strings).
+            TypeError: If ``bbox_dims_cm`` is not iterable or holds values
+                ``float`` cannot convert.
+        """
+        import numpy as np
+
+        dims_cm = np.asarray([float(v) for v in bbox_dims_cm], dtype=np.float64)
+        # NaN and inf compare False against ``<= 0.0``, so they need their own
+        # check; otherwise they would propagate into the scale factor.
+        if (
+            dims_cm.shape != (3,)
+            or not np.all(np.isfinite(dims_cm))
+            or np.any(dims_cm <= 0.0)
+        ):
+            raise ValueError("bbox_dims_cm must contain three positive values.")
+
+        # Imported only once the input has been validated.
+        from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+            GeometryManager,
+        )
+
+        nbs_cm = np.asarray(normalized_bbox_size_m, dtype=np.float64) * 100.0
+        axis_match = GeometryManager.best_axis_bbox_scale_match(
+            source_size_cm=nbs_cm,
+            target_size_cm=dims_cm,
+        )
+        return {
+            "bbox_dims_cm": dims_cm.tolist(),
+            "axis_match": axis_match,
+            "scale_factor": float(axis_match["scale_factor"]),
+            "confidence": confidence,
+            "reason": reason,
+        }
+
+
+    @staticmethod
+    def failure(*, object_id, reason, method):
+        """Return a ``"failed"`` scale entry with the neutral scale factor 1.0."""
+        return dict(
+            status="failed",
+            method=method,
+            object_id=object_id,
+            scale_factor=1.0,
+            reason=reason,
+        )
+
+
+    @staticmethod
+    def set_for_all_objects(*, objects, status, reason, method):
+        """Overwrite ``metric_scale`` on every record with a neutral entry.
+
+        Each record receives the given ``status``, ``method`` and ``reason``,
+        its own ``"id"`` rendered as a string, and ``scale_factor`` 1.0.
+        Mutates ``objects`` in place.
+        """
+        for record in objects:
+            neutral_entry = dict(
+                status=status,
+                method=method,
+                object_id=str(record.get("id", "")),
+                scale_factor=1.0,
+                reason=reason,
+            )
+            record["metric_scale"] = neutral_entry
+
+
+    @staticmethod
+    def compute_global_from_object_scenes(request):
+        """Derive one global scale factor from per-object metric-scale records.
+
+        For every ``(object_id, scene)`` pair in ``request.object_scenes``
+        whose object record carries a ``metric_scale`` entry with status
+        ``"ok"``, the per-object effective scale is
+        ``scale_factor / geo_scale``. ``geo_scale`` is the median ratio
+        between the sorted bbox extents of the current scene and the sorted
+        ``normalized_bbox_size_m`` stored in the record. The global factor is
+        the mean of the effective scales, clipped to
+        ``[request.min_scale, request.max_scale]``.
+
+        Args:
+            request: Object exposing ``objects`` (dict records looked up by
+                ``"id"``), ``object_scenes``, ``min_scale`` and ``max_scale``.
+
+        Returns:
+            A report dict. ``status`` is ``"disabled"`` when
+            ``METRIC_SCALE_ENABLED`` is false, ``"fallback"`` (scale 1.0) when
+            no object was usable, otherwise ``"ok"``. ``used`` and ``skipped``
+            list the per-object details and skip reasons.
+        """
+        import numpy as np
+        from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+            GeometryManager,
+        )
+
+        if not METRIC_SCALE_ENABLED:
+            return {
+                "status": "disabled",
+                "method": "metric_scale_disabled",
+                "scale_factor": 1.0,
+                "object_count": len(request.objects),
+                "used_count": 0,
+                "skipped_count": len(request.objects),
+                "used": [],
+                "skipped": [
+                    {"id": str(item.get("id", "")), "reason": "metric_scale_disabled"}
+                    for item in request.objects
+                ],
+                "unit_note": "Metric scale is disabled; GLBs keep simready size.",
+            }
+
+        used = []
+        skipped = []
+        object_by_id = {str(item.get("id", "")): item for item in request.objects}
+        for object_id, scene in request.object_scenes:
+            item = object_by_id.get(object_id)
+            if item is None:
+                skipped.append({"id": object_id, "reason": "missing_object_record"})
+                continue
+            ms = item.get("metric_scale")
+            if not isinstance(ms, dict):
+                skipped.append({"id": object_id, "reason": "missing_metric_scale"})
+                continue
+            if ms.get("status") != "ok":
+                skipped.append({"id": object_id, "reason": str(ms.get("status") or "not_ok")})
+                continue
+
+            try:
+                sf = float(ms.get("scale_factor", 1.0))
+            except (TypeError, ValueError):
+                # A null or non-numeric scale factor skips this object only;
+                # it must not abort the aggregation over the other objects.
+                skipped.append({"id": object_id, "reason": "invalid_simready_scale_factor"})
+                continue
+            if not np.isfinite(sf) or sf <= 0.0:
+                skipped.append({"id": object_id, "reason": "invalid_simready_scale_factor"})
+                continue
+
+            try:
+                srs = np.asarray(
+                    [float(v) for v in ms.get("normalized_bbox_size_m", [])],
+                    dtype=np.float64,
+                )
+            except (TypeError, ValueError):
+                skipped.append({"id": object_id, "reason": "invalid_normalized_bbox_size_m"})
+                continue
+            if srs.shape != (3,) or np.any(srs <= 0.0):
+                skipped.append({"id": object_id, "reason": "invalid_normalized_bbox_size_m"})
+                continue
+
+            cs = np.asarray(
+                GeometryManager.mesh_metric_bbox_size(
+                    GeometryManager.scene_to_mesh(scene)
+                ),
+                dtype=np.float64,
+            )
+            if cs.shape != (3,) or np.any(cs <= 0.0):
+                skipped.append({"id": object_id, "reason": "invalid_current_scene_bbox"})
+                continue
+
+            # Extents are sorted on both sides before dividing, so the ratio
+            # does not depend on which axis each extent lies along.
+            geo_ratio = np.sort(cs) / np.sort(srs)
+            geo_scale = float(np.median(geo_ratio))
+            if not np.isfinite(geo_scale) or geo_scale <= 0.0:
+                skipped.append({"id": object_id, "reason": "non_positive_geo_scale"})
+                continue
+            effective = sf / geo_scale
+            if not np.isfinite(effective) or effective <= 0.0:
+                skipped.append({"id": object_id, "reason": "non_positive_effective_scale"})
+                continue
+
+            used.append({
+                "id": object_id,
+                "effective_scale": effective,
+                "scale_factor_simready": sf,
+                "geo_scale": geo_scale,
+                "simready_bbox_size_m": srs.tolist(),
+                "simready_bbox_size_cm": (srs * 100.0).tolist(),
+                "current_scene_bbox_size_m": cs.tolist(),
+                "current_scene_bbox_size_cm": (cs * 100.0).tolist(),
+                "target_bbox_dims_cm": ms.get("bbox_dims_cm"),
+                "confidence": ms.get("confidence"),
+            })
+
+        if not used:
+            return {
+                "status": "fallback",
+                "method": "simready_reference_geo_ratio_mean_with_clamp",
+                "scale_factor": 1.0,
+                "raw_scale_factor": 1.0,
+                "was_clamped": False,
+                "clamp": {"min": request.min_scale, "max": request.max_scale},
+                "object_count": len(request.objects),
+                "used_count": 0,
+                "skipped_count": len(skipped),
+                "used": [],
+                "skipped": skipped,
+                "unit_note": "No valid metric scale available.",
+            }
+
+        raw = float(np.mean([u["effective_scale"] for u in used]))
+        sf = float(np.clip(raw, request.min_scale, request.max_scale))
+        return {
+            "status": "ok",
+            "method": "simready_reference_geo_ratio_mean_with_clamp",
+            "scale_factor": sf,
+            "raw_scale_factor": raw,
+            "was_clamped": bool(sf != raw),
+            "clamp": {"min": request.min_scale, "max": request.max_scale},
+            "object_count": len(request.objects),
+            "used_count": len(used),
+            "skipped_count": len(skipped),
+            "used": used,
+            "skipped": skipped,
+            "unit_note": (
+                f"Global scale via per-object metric scale / geo ratio, "
+                f"clamped to [{request.min_scale:.2f}, {request.max_scale:.2f}]."
+            ),
+        }
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/schemas.py
new file mode 100644
index 000000000..bc105c9df
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/schemas.py
@@ -0,0 +1,99 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+# Public names of this module: the request/result dataclasses defined below.
+__all__ = [
+ "EstimateMetricScalesRequest",
+ "EstimateMetricScalesResult",
+ "GlobalMetricScaleRequest",
+ "MakeAssetSimreadyRequest",
+ "MakeAssetSimreadyResult",
+ "MakeTableSimreadyRequest",
+ "MakeTableSimreadyResult",
+ "MetricScaleObjectInput",
+]
+
+
+@dataclass(frozen=True)
+class MakeAssetSimreadyRequest:
+ """Immutable input for ``SimreadyManager.make_asset_simready``."""
+
+ # Source mesh file; the manager expands ``~`` and resolves it before loading.
+ input_path: Path
+ # Destination file; the manager rejects any suffix other than ``.glb``.
+ output_path: Path
+ # Up axis of the input mesh; ``None`` selects the manager's DEFAULT_INPUT_UP_AXIS.
+ input_up_axis: list[float] | None = None
+ # NOTE(review): not read by make_asset_simready in the manager code shown
+ # in this change; confirm whether it is intentionally unused.
+ up_axis: list[float] | None = None
+ # Clearance handed to the place-above-plane steps.
+ ground_clearance: float = 0.01
+
+
+@dataclass(frozen=True)
+class MakeAssetSimreadyResult:
+ """Immutable output of ``SimreadyManager.make_asset_simready``."""
+
+ # Resolved path of the exported GLB.
+ output_path: Path
+ # Accumulated raw-to-simready matrix, converted with ``ndarray.tolist()``.
+ transform_matrix: list[list[float]]
+
+
+@dataclass(frozen=True)
+class MakeTableSimreadyRequest:
+ """Immutable input for ``SimreadyManager.make_table_simready``."""
+
+ # Source mesh file; the manager expands ``~`` and resolves it before loading.
+ input_path: Path
+ # Destination file; the manager rejects any suffix other than ``.glb``.
+ output_path: Path
+ # Up axis of the input mesh; ``None`` selects the manager's DEFAULT_INPUT_UP_AXIS.
+ input_up_axis: list[float] | None = None
+ # Axis the detected tabletop normal is aligned to; ``None`` selects DEFAULT_UP_AXIS.
+ up_axis: list[float] | None = None
+ # Clearance handed to the place-above-plane steps.
+ ground_clearance: float = 0.01
+
+
+@dataclass(frozen=True)
+class MakeTableSimreadyResult:
+ """Immutable output of ``SimreadyManager.make_table_simready``."""
+
+ # Resolved path of the exported GLB.
+ output_path: Path
+ # Accumulated raw-to-simready matrix, converted with ``ndarray.tolist()``.
+ transform_matrix: list[list[float]]
+
+
+@dataclass(frozen=True)
+class MetricScaleObjectInput:
+ """One object whose metric scale is to be estimated."""
+
+ # Identifier, name and description are copied verbatim into the payload
+ # built by ``SimreadyManager.build_object_payload``.
+ object_id: str
+ object_name: str
+ object_description: str
+ # Mesh file loaded by ``build_object_payload`` to measure the bbox.
+ mesh_path: Path
+
+
+@dataclass(frozen=True)
+class EstimateMetricScalesRequest:
+ """Immutable input for ``SimreadyManager.estimate_metric_scales``."""
+
+ # Objects to measure and estimate.
+ objects: list[MetricScaleObjectInput]
+ # messages, schema, llm and context are forwarded unchanged to
+ # ``call_structured_json_model_step``.
+ messages: list[dict[str, Any]]
+ schema: dict[str, Any]
+ llm: Any
+ context: str
+ # Label copied into every resulting scale entry.
+ method: str
+ # NOTE(review): not read by estimate_metric_scales in the manager code
+ # shown in this change; confirm where it is consumed.
+ step_name: str = "metric_scale"
+ # When set, the raw model payload is written to this path via ``write_json``.
+ raw_output_path: Path | None = None
+
+
+@dataclass(frozen=True)
+class EstimateMetricScalesResult:
+ """Immutable output of ``SimreadyManager.estimate_metric_scales``."""
+
+ # ``estimate_metric_scales`` always sets this to "ok".
+ status: str
+ # One entry per object, as returned by ``apply_model_output``.
+ object_scales: list[dict[str, Any]]
+ # Geometric payload built by ``build_object_payload``.
+ object_payload: list[dict[str, Any]]
+ # Return value of the structured-JSON model step, kept as-is.
+ raw_model_output: dict[str, Any] | None = None
+ # Left at its default by ``estimate_metric_scales``.
+ reason: str = ""
+
+
+@dataclass(frozen=True)
+class GlobalMetricScaleRequest:
+ """Immutable input for ``SimreadyManager.compute_global_from_object_scenes``."""
+
+ # Object records; looked up by their "id" key and read for "metric_scale".
+ objects: list[dict[str, Any]]
+ # (object_id, scene) pairs; each scene is converted to a mesh and measured.
+ object_scenes: list[tuple[str, Any]]
+ # Lower and upper clamp applied to the averaged scale factor.
+ min_scale: float = 0.10
+ max_scale: float = 10.00
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/utils.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/utils.py
new file mode 100644
index 000000000..1a52dd13d
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simready_manager/utils.py
@@ -0,0 +1,136 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+
+# Every helper here is underscore-prefixed, so a star-import would skip them
+# by default; listing them makes them part of the module's explicit exports.
+__all__ = [
+ "_as_transform",
+ "_axis_angle_rotation",
+ "_axis_conversion_transform",
+ "_center_aabb_bottom_xy_at_origin",
+ "_center_aabb_bottom_xy_at_origin_transform",
+ "_normalize",
+ "_orthogonal_axis",
+ "_place_above_plane_transform",
+ "_request_axis",
+ "_rotation_between_vectors",
+ "_scale_transform",
+ "_translation_transform",
+]
+
+
+def _request_axis(value: list[float] | None, default: tuple[float, float, float]) -> list[float]:
+    """Return ``value`` as a new list, or ``default`` as a list when ``value`` is ``None``."""
+    chosen = default if value is None else value
+    return list(chosen)
+
+
+def _center_aabb_bottom_xy_at_origin(mesh: Any) -> Any:
+    """Return a copy of ``mesh`` whose AABB is centred on the origin in X and Y.
+
+    The Z coordinate is left unchanged and the input mesh is not modified.
+    """
+    lower, upper = mesh.bounds[0], mesh.bounds[1]
+    shift = [-((float(lower[axis]) + float(upper[axis])) * 0.5) for axis in (0, 1)]
+    shifted = mesh.copy()
+    shifted.apply_translation([shift[0], shift[1], 0.0])
+    return shifted
+
+
+def _axis_conversion_transform(source_axis: list[float], target_axis: list[float]) -> np.ndarray:
+    """Return the 4x4 rotation that maps ``source_axis`` onto ``target_axis``.
+
+    Both axes are converted to float64 arrays and normalised first, so a
+    zero-length axis raises ``ValueError`` from ``_normalize``.
+    """
+    source, target = (
+        _normalize(np.asarray(axis, dtype=np.float64))
+        for axis in (source_axis, target_axis)
+    )
+    return _rotation_between_vectors(source, target)
+
+
+def _place_above_plane_transform(mesh: Any, clearance: float) -> np.ndarray:
+    """Return the 4x4 Z-translation that puts the mesh's lowest Z at ``clearance``."""
+    lowest_z = float(mesh.bounds[0][2])
+    lift = np.array([0.0, 0.0, clearance - lowest_z])
+    return _translation_transform(lift)
+
+
+def _center_aabb_bottom_xy_at_origin_transform(mesh: Any) -> np.ndarray:
+    """Return the 4x4 translation that centres the mesh's AABB on the XY origin.
+
+    This is the matrix form of ``_center_aabb_bottom_xy_at_origin``; Z is not
+    changed.
+    """
+    lower, upper = mesh.bounds[0], mesh.bounds[1]
+    mid_x = (float(lower[0]) + float(upper[0])) * 0.5
+    mid_y = (float(lower[1]) + float(upper[1])) * 0.5
+    offset = np.array([-mid_x, -mid_y, 0.0])
+    return _translation_transform(offset)
+
+
+def _translation_transform(translation: np.ndarray) -> np.ndarray:
+    """Return a float64 4x4 identity whose last column holds ``translation``."""
+    matrix = np.identity(4, dtype=np.float64)
+    matrix[0:3, 3] = translation
+    return matrix
+
+
+def _scale_transform(scale: float) -> np.ndarray:
+    """Return a float64 4x4 uniform-scale matrix (homogeneous entry stays 1)."""
+    matrix = np.eye(4, dtype=np.float64)
+    diagonal = np.arange(3)
+    matrix[diagonal, diagonal] = scale
+    return matrix
+
+
+def _as_transform(value: Any) -> np.ndarray:
+    """Convert a 4x4 array-like into a new float64 ``ndarray``.
+
+    Besides ``np.ndarray`` this accepts anything ``np.array`` can convert,
+    such as nested lists. The result is always a copy.
+
+    Args:
+        value: A 4x4 array-like of numbers.
+
+    Returns:
+        A float64 array of shape ``(4, 4)``.
+
+    Raises:
+        TypeError: If ``value`` cannot be converted or is not 4x4.
+    """
+    try:
+        matrix = np.array(value, dtype=np.float64)
+    except (TypeError, ValueError) as exc:
+        raise TypeError(f"Cannot convert {type(value)} to 4x4 transform.") from exc
+    if matrix.shape != (4, 4):
+        raise TypeError(f"Cannot convert {type(value)} to 4x4 transform.")
+    return matrix
+
+
+def _rotation_between_vectors(source: np.ndarray, target: np.ndarray) -> np.ndarray:
+    """Return the 4x4 rotation taking the direction of ``source`` onto ``target``.
+
+    Parallel inputs give the identity. Anti-parallel inputs give a half turn
+    about an axis supplied by ``_orthogonal_axis``. Every other pair is rotated
+    about the cross product of the two normalised vectors.
+    """
+    unit_source = _normalize(source)
+    unit_target = _normalize(target)
+    cosine = np.dot(unit_source, unit_target)
+
+    if cosine > 1.0 - 1e-10:
+        # Already aligned: nothing to rotate.
+        return np.eye(4, dtype=np.float64)
+
+    if cosine < -1.0 + 1e-10:
+        # Opposite directions: the cross product vanishes, so any
+        # perpendicular axis serves for the half turn.
+        return _axis_angle_rotation(_orthogonal_axis(unit_source), np.pi)
+
+    normal = np.cross(unit_source, unit_target)
+    sine = np.linalg.norm(normal)
+    return _axis_angle_rotation(normal / sine, np.arctan2(sine, cosine))
+
+
+def _axis_angle_rotation(axis: np.ndarray, angle: float) -> np.ndarray:
+    """Return the 4x4 homogeneous rotation by ``angle`` radians about ``axis``.
+
+    ``axis`` is normalised first, so it need not be unit length, but a
+    zero-length axis raises ``ValueError`` from ``_normalize``.
+    """
+    x, y, z = _normalize(axis)
+    c = np.cos(angle)
+    s = np.sin(angle)
+    t = 1.0 - c
+
+    rotation = [
+        [t * x * x + c, t * x * y - s * z, t * x * z + s * y],
+        [t * x * y + s * z, t * y * y + c, t * y * z - s * x],
+        [t * x * z - s * y, t * y * z + s * x, t * z * z + c],
+    ]
+    matrix = np.eye(4, dtype=np.float64)
+    matrix[:3, :3] = rotation
+    return matrix
+
+
+def _orthogonal_axis(vector: np.ndarray) -> np.ndarray:
+    """Return a non-zero vector exactly perpendicular to ``vector``.
+
+    The result is not normalised. It is built by swapping two components and
+    negating one, which avoids any division, so it stays finite for every
+    non-zero input and its dot product with ``vector`` is exactly zero.
+
+    Args:
+        vector: Non-zero 3-vector.
+
+    Returns:
+        A float64 3-vector with norm of at least 1e-12.
+
+    Raises:
+        ValueError: If ``vector`` has zero length (raised by ``_normalize``).
+    """
+    x, y, z = _normalize(vector)
+    if abs(x) < 0.9:
+        # (z, 0, -x) . (x, y, z) == 0; it only degenerates when the vector
+        # lies along +/-Y, where the X axis is perpendicular instead.
+        candidate = np.array([z, 0.0, -x], dtype=np.float64)
+        if np.linalg.norm(candidate) < 1e-12:
+            return np.array([1.0, 0.0, 0.0], dtype=np.float64)
+        return candidate
+    # |x| >= 0.9 here, so (-y, x, 0) can never be the zero vector.
+    return np.array([-y, x, 0.0], dtype=np.float64)
+
+
+def _normalize(vector: np.ndarray) -> np.ndarray:
+    """Return ``vector`` divided by its Euclidean norm.
+
+    Raises:
+        ValueError: If the norm is below 1e-12.
+    """
+    length = np.linalg.norm(vector)
+    if not (length < 1e-12):
+        return vector / length
+    raise ValueError("Cannot normalise zero-length vector.")
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/__init__.py
new file mode 100644
index 000000000..9441c6b80
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/__init__.py
@@ -0,0 +1,31 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.manager import (
+ SimulationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.schemas import (
+ GravityDropRequest,
+ GravityDropResult,
+)
+
+# Names re-exported by this package: the manager and its gravity-drop
+# request/result schemas imported above.
+__all__ = [
+ "GravityDropRequest",
+ "GravityDropResult",
+ "SimulationManager",
+]
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/manager.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/manager.py
new file mode 100644
index 000000000..4a0721103
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/manager.py
@@ -0,0 +1,124 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Simulation manager for gravity-based asset placement."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import numpy as np
+import torch
+import trimesh
+
+from embodichain.lab.sim.cfg import RigidObjectCfg
+from embodichain.lab.sim.shapes import MeshCfg
+from embodichain.lab.sim.sim_manager import (
+ SimulationManager as _EmbodiSimManager,
+ SimulationManagerCfg,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.schemas import (
+ GravityDropRequest,
+ GravityDropResult,
+)
+
+__all__ = ["SimulationManager"]
+
+
+class SimulationManager:
+    """Manager for gravity-based asset placement.
+
+    Wraps an EmbodiChain simulation instance with typed request/response
+    methods, following the same pattern as service clients. A fresh
+    simulation is created for every request and destroyed afterwards.
+    """
+
+    def __init__(
+        self,
+        *,
+        headless: bool = True,
+        physics_dt: float = 0.01,
+        sim_device: str = "cpu",
+        settle_steps: int = 300,
+    ) -> None:
+        """Initialize the simulation manager.
+
+        Args:
+            headless: Whether to run without a GUI.
+            physics_dt: Physics timestep in seconds.
+            sim_device: Device to run the simulation on.
+            settle_steps: Number of simulation steps to run before the final
+                pose is read. Defaults to 300, the previously fixed value.
+
+        Raises:
+            ValueError: If ``settle_steps`` is not positive.
+        """
+        if settle_steps <= 0:
+            raise ValueError("settle_steps must be positive.")
+        self._headless = headless
+        self._physics_dt = physics_dt
+        self._sim_device = sim_device
+        self._settle_steps = settle_steps
+
+    def run_gravity_simulation(
+        self, request: GravityDropRequest
+    ) -> GravityDropResult:
+        """Drop one GLB under gravity and return its final pose.
+
+        Args:
+            request: GLB path, convex-hull budget and optional initial height.
+                When ``initial_height`` is ``None`` the height is derived
+                from the GLB bounding box.
+
+        Returns:
+            The object's local pose matrix after stepping, as a float array.
+
+        Raises:
+            FileNotFoundError: If ``request.glb_path`` is not an existing file.
+            ValueError: If the drop height has to be derived and the GLB has
+                no bounds.
+        """
+        glb_path = request.glb_path.expanduser().resolve()
+        if not glb_path.is_file():
+            raise FileNotFoundError(f"GLB file not found: {glb_path}")
+
+        if request.initial_height is not None:
+            initial_height = float(request.initial_height)
+        else:
+            initial_height = self._compute_adaptive_drop_height(glb_path)
+
+        sim = _EmbodiSimManager(
+            SimulationManagerCfg(
+                headless=self._headless,
+                physics_dt=self._physics_dt,
+                sim_device=self._sim_device,
+            )
+        )
+        try:
+            obj = sim.add_rigid_object(
+                RigidObjectCfg(
+                    uid="dropped_asset",
+                    shape=MeshCfg(fpath=str(glb_path)),
+                    init_pos=(0.0, 0.0, initial_height),
+                    init_rot=(0.0, 0.0, 0.0),
+                    body_type="dynamic",
+                    max_convex_hull_num=request.max_convex_hull_num,
+                )
+            )
+            sim.update(step=self._settle_steps)
+            final_pose = obj.get_local_pose(to_matrix=True)[0].detach().cpu()
+        finally:
+            # Tear the simulation down even when loading or stepping fails,
+            # so a bad asset does not leave a simulation instance behind.
+            sim._deferred_destroy()
+
+        return GravityDropResult(
+            final_pose=np.asarray(final_pose.numpy(), dtype=float),
+        )
+
+    def _compute_adaptive_drop_height(
+        self,
+        glb_path: Path,
+        *,
+        min_clearance: float = 0.2,
+        height_scale: float = 1.25,
+    ) -> float:
+        """Compute an initial drop height from a GLB bounding box.
+
+        The result is the larger of ``height * height_scale`` and
+        ``height + min_clearance``, where ``height`` is the extent of the
+        loaded geometry along its third (index 2) axis.
+
+        Args:
+            glb_path: Path to the GLB file to measure.
+            min_clearance: Minimum distance added on top of the height.
+            height_scale: Multiplier applied to the height.
+
+        Returns:
+            The drop height in the GLB's own units.
+
+        Raises:
+            ValueError: If ``min_clearance`` is negative, ``height_scale`` is
+                not positive, or the loaded GLB exposes no bounds.
+        """
+        if min_clearance < 0.0:
+            raise ValueError("min_clearance must be non-negative.")
+        if height_scale <= 0.0:
+            raise ValueError("height_scale must be positive.")
+
+        glb_path = glb_path.expanduser().resolve()
+        # Scenes and single meshes both expose ``bounds``, so no type
+        # dispatch is needed here.
+        bounds = trimesh.load(glb_path, force=None).bounds
+        if bounds is None:
+            raise ValueError(f"GLB contains no geometry: {glb_path}")
+        height = float(bounds[1][2] - bounds[0][2])
+        return max(height * height_scale, height + min_clearance)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/schemas.py b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/schemas.py
new file mode 100644
index 000000000..c9df4a526
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/managers/simulation_manager/schemas.py
@@ -0,0 +1,42 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+__all__ = [
+ "GravityDropRequest",
+ "GravityDropResult",
+]
+
+
+@dataclass(frozen=True)
+class GravityDropRequest:
+ """Request to drop a GLB asset under gravity simulation."""
+
+ # Path to the GLB file; the manager expands ``~`` and resolves it before use.
+ glb_path: Path
+ # Forwarded unchanged to the rigid-object config as ``max_convex_hull_num``.
+ max_convex_hull_num: int = 32
+ # Initial z position of the asset. When None, the manager derives a height
+ # from the GLB bounding box instead.
+ initial_height: float | None = None
+
+
+@dataclass(frozen=True)
+class GravityDropResult:
+ """Result of dropping a GLB asset under gravity."""
+
+ # Final local pose of the dropped asset. The manager fills this with a float
+ # NumPy array built from the simulator's pose matrix.
+ # NOTE(review): presumably a 4x4 homogeneous transform — confirm against
+ # ``get_local_pose(to_matrix=True)``.
+ final_pose: Any
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/servers/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/servers/__init__.py
new file mode 100644
index 000000000..e50272eff
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/servers/__init__.py
@@ -0,0 +1,16 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+"""External servers, ignored by git, for testing or demo purposes."""
\ No newline at end of file
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/__init__.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/__init__.py
new file mode 100644
index 000000000..015c41510
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/__init__.py
@@ -0,0 +1,19 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_layout_alignment.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_layout_alignment.py
new file mode 100644
index 000000000..f6ca1d9dd
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_layout_alignment.py
@@ -0,0 +1,621 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+import traceback
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.llms.llm_output import (
+ call_structured_json_model_step,
+)
+from embodichain.gen_sim.prompt2scene.prompts.builders import (
+ build_up_down_flip_check_messages,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager import (
+ GlobalMetricScaleRequest,
+ MetricScaleManager,
+)
+from embodichain.gen_sim.prompt2scene.prompts.schemas import (
+ UP_DOWN_FLIP_CHECK_JSON_SCHEMA,
+)
+
+# Minimum model confidence required before the "flipped" candidate is applied;
+# below it the "normal" candidate is kept even if the model picked "flipped".
+UP_DOWN_FLIP_CHECK_CONFIDENCE_THRESHOLD = 0.6
+# NOTE(review): not referenced anywhere else in this module — confirm it is
+# still needed by importers.
+UNIFIED_SCENE_STEP = "unified_scene"
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.blender_rendering_manager import (
+ BlenderRenderingManager,
+ RenderObjectScenesRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.matplotlib_manager import (
+ MatplotlibManager,
+ RenderImageComparisonRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+ GeometryManager,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import (
+ relative_path,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.layout_manager import (
+ LayoutManager,
+)
+
+__all__ = ["_export_support_aligned_layout_glbs"]
+
+
+def _export_support_aligned_layout_glbs(
+ *,
+ table: dict[str, Any],
+ objects: list[dict[str, Any]],
+ spatial_relations: list[dict[str, Any]],
+ original_image_path: Path | None,
+ llm: Any | None,
+ output_dir: Path,
+ output_root: Path,
+) -> dict[str, Any]:
+ """Export layout-baked GLBs aligned by support normal and left-right order.
+
+ Processing order:
+
+ 1. Load the table's support reference GLB and every object GLB that has
+ both a raw geometry path and a transform matrix, applying each transform.
+ 2. Estimate the support normal from the table mesh and rotate all objects
+ so that normal maps to +Z.
+ 3. Translate the objects so the center of their union AABB is the origin.
+ 4. Build "normal" and "flipped" yaw-aligned candidates and let the up-down
+ flip check choose one ("normal" is the fallback).
+ 5. Compute the table relative-scale hint and the global metric scale, then
+ scale the objects when the factor is not exactly 1.0.
+ 6. Settle and pack object footprints via ``LayoutManager``.
+ 7. Export each object as ``<id>_aligned.glb`` after converting from the
+ internal Z-up frame to the GLB Y-up frame.
+
+ Args:
+ table: Table record; read keys are ``support_reference_geometry_path``,
+ ``raw_geometry_path``, ``support_reference_transform_matrix``,
+ ``transform_matrix`` and ``is_complete_visible_table``.
+ objects: Object records; each needs ``id`` and is used only when
+ ``raw_geometry_path`` and ``transform_matrix`` are both truthy.
+ spatial_relations: Relation records used to pick the yaw.
+ original_image_path: Source image handed to the flip check, if any.
+ llm: Model handle handed to the flip check, if any.
+ output_dir: Directory that receives the exported GLBs and sub-reports.
+ output_root: Base directory for resolving and relativizing paths.
+
+ Returns:
+ A manifest dict with ``status`` "ok", the alignment matrices, metric
+ scale, footprint layout, yaw sampling metadata, flip-check result and
+ the list of exported objects.
+
+ Raises:
+ RuntimeError: If ``trimesh`` cannot be imported.
+ FileNotFoundError: If the support reference GLB does not exist.
+ ValueError: If no object has both a raw GLB path and a transform.
+ """
+ try:
+ import trimesh
+ except ImportError as exc:
+ raise RuntimeError("Support-aligned GLB export requires trimesh.") from exc
+
+ output_dir.mkdir(parents=True, exist_ok=True)
+ # Prefer the dedicated support reference geometry; fall back to the raw table.
+ support_reference_path = _resolve_generated_path(
+ table.get("support_reference_geometry_path") or table.get("raw_geometry_path"),
+ output_root,
+ )
+ # (object id, resolved GLB path, transform JSON) for every usable object.
+ object_paths = [
+ (
+ str(item["id"]),
+ _resolve_generated_path(item.get("raw_geometry_path"), output_root),
+ item.get("transform_matrix"),
+ )
+ for item in objects
+ if item.get("raw_geometry_path") and item.get("transform_matrix")
+ ]
+ if not support_reference_path.is_file():
+ raise FileNotFoundError(
+ f"Support reference table GLB not found: {support_reference_path}"
+ )
+ support_reference_transform = GeometryManager.matrix_from_json(
+ table.get("support_reference_transform_matrix")
+ or table.get("transform_matrix"),
+ name="table.support_reference_transform_matrix",
+ )
+ if not object_paths:
+ raise ValueError("No raw object GLBs with transform matrices available.")
+
+ support_reference_scene = trimesh.load(support_reference_path, force="scene")
+ support_reference_scene.apply_transform(support_reference_transform)
+ object_scenes = [
+ (
+ object_id,
+ GeometryManager.load_scene_with_transform(
+ path=path,
+ transform=GeometryManager.matrix_from_json(
+ transform,
+ name=f"{object_id}.transform_matrix",
+ ),
+ trimesh=trimesh,
+ ),
+ )
+ for object_id, path, transform in object_paths
+ ]
+ table_mesh = GeometryManager.scene_to_mesh(support_reference_scene, trimesh=trimesh)
+ support_normal = GeometryManager.estimate_support_normal(table_mesh)
+ # Rotation that takes the estimated support normal onto the internal +Z up axis.
+ normal_alignment = GeometryManager.rotation_between_vectors(
+ support_normal,
+ np.array([0.0, 0.0, 1.0]),
+ )
+
+ # Only the object scenes are rotated in place here; the table scene is
+ # aligned later on a copy inside the relative-scale hint.
+ for _, scene in object_scenes:
+ scene.apply_transform(normal_alignment)
+
+ object_bounds = [
+ GeometryManager.scene_to_mesh(scene, trimesh=trimesh).bounds
+ for _, scene in object_scenes
+ ]
+ # Union AABB of all objects: row 0 is the min corner, row 1 the max corner.
+ clutter_bounds = np.vstack(
+ [
+ np.vstack([bounds[0] for bounds in object_bounds]).min(axis=0),
+ np.vstack([bounds[1] for bounds in object_bounds]).max(axis=0),
+ ]
+ )
+ clutter_center = 0.5 * (clutter_bounds[0] + clutter_bounds[1])
+ # Pure translation that moves the clutter AABB center to the origin.
+ center_transform = np.eye(4, dtype=np.float64)
+ center_transform[:3, 3] = [
+ -float(clutter_center[0]),
+ -float(clutter_center[1]),
+ -float(clutter_center[2]),
+ ]
+
+ for _, scene in object_scenes:
+ scene.apply_transform(center_transform)
+
+ alignment_candidates = _build_up_down_alignment_candidates(
+ object_scenes=object_scenes,
+ support_normal=support_normal,
+ normal_alignment=normal_alignment,
+ spatial_relations=spatial_relations,
+ trimesh=trimesh,
+ )
+ vlm_check_dir = output_dir / "vlm_up_down_flip_check"
+ up_down_flip_check_result = _run_aligned_up_down_flip_vlm_check(
+ llm=llm,
+ original_image_path=original_image_path,
+ normal_object_scenes=alignment_candidates["normal"]["object_scenes"],
+ flipped_object_scenes=alignment_candidates["flipped"]["object_scenes"],
+ output_dir=vlm_check_dir,
+ )
+ # A skipped or failed check carries no "selected_variant", so "normal" wins.
+ selected_variant = str(
+ up_down_flip_check_result.get("selected_variant") or "normal"
+ )
+ if selected_variant not in alignment_candidates:
+ selected_variant = "normal"
+ selected_candidate = alignment_candidates[selected_variant]
+ # From here on, work on the chosen candidate's scene copies.
+ object_scenes = selected_candidate["object_scenes"]
+ selected_extra_transform = selected_candidate["extra_transform"]
+ apply_up_down_flip = selected_variant == "flipped"
+ # Computed before metric scaling; the table copy receives the same
+ # normal-alignment, centering and candidate transforms as the objects.
+ complete_table_relative_scale_hint = _complete_table_relative_scale_hint(
+ table=table,
+ support_reference_scene=support_reference_scene,
+ object_scenes=object_scenes,
+ table_alignment_matrix=selected_extra_transform
+ @ center_transform
+ @ normal_alignment,
+ trimesh=trimesh,
+ )
+
+ global_metric_scale = MetricScaleManager.compute_global_from_object_scenes(
+ GlobalMetricScaleRequest(
+ objects=objects,
+ object_scenes=object_scenes,
+ )
+ )
+ metric_scale_transform = GeometryManager.scale_transform(
+ global_metric_scale["scale_factor"]
+ )
+ # Skip the in-place scaling pass when the factor is exactly 1.0.
+ if float(global_metric_scale["scale_factor"]) != 1.0:
+ for _, scene in object_scenes:
+ scene.apply_transform(metric_scale_transform)
+
+ footprint_result = LayoutManager.settle_and_pack_object_footprints(
+ object_scenes=object_scenes,
+ output_dir=output_dir / "footprint_layout",
+ output_root=output_root,
+ trimesh=trimesh,
+ )
+ object_scenes = footprint_result["object_scenes"]
+
+ # Exported files use the GLB Y-up convention; the in-memory scenes stay Z-up
+ # because the transform is applied to a copy.
+ output_axis_transform = GeometryManager.z_up_to_glb_y_up_transform()
+ object_outputs = []
+ for object_id, scene in object_scenes:
+ object_output = output_dir / f"{object_id}_aligned.glb"
+ GeometryManager.copy_scene_with_transform(
+ scene,
+ output_axis_transform,
+ ).export(object_output)
+ object_outputs.append(
+ {
+ "id": object_id,
+ "aligned_geometry_path": relative_path(str(object_output), output_root),
+ }
+ )
+
+ # Transforms compose right-to-left: normal alignment, centering, then the
+ # selected candidate's flip/yaw; metric scaling is applied last.
+ alignment_matrix = selected_extra_transform @ center_transform @ normal_alignment
+ scaled_alignment_matrix = metric_scale_transform @ alignment_matrix
+ final_clutter_aabb_2d_cm = LayoutManager.object_scenes_xy_aabb_manifest(
+ object_scenes=object_scenes,
+ trimesh=trimesh,
+ unit_scale=100.0,
+ unit="cm",
+ )
+ return {
+ "status": "ok",
+ "output_dir": relative_path(str(output_dir), output_root),
+ "support_normal": support_normal.tolist(),
+ "clutter_aabb_center_before_centering": clutter_center.tolist(),
+ "alignment_matrix": scaled_alignment_matrix.tolist(),
+ "pre_metric_scale_alignment_matrix": alignment_matrix.tolist(),
+ "global_metric_scale": global_metric_scale,
+ "final_clutter_2d_aabb_cm": final_clutter_aabb_2d_cm,
+ "complete_table_relative_scale_hint": complete_table_relative_scale_hint,
+ "internal_up_axis": [0.0, 0.0, 1.0],
+ "glb_output_up_axis": [0.0, 1.0, 0.0],
+ "glb_output_axis_transform": output_axis_transform.tolist(),
+ "selected_up_down_variant": selected_variant,
+ "applied_up_down_flip": apply_up_down_flip,
+ "selected_extra_transform": selected_extra_transform.tolist(),
+ # Per-object footprint layout transform applied after the shared alignment.
+ "object_alignment_matrices": {
+ object_id: (object_transform @ scaled_alignment_matrix).tolist()
+ for object_id, object_transform in footprint_result[
+ "object_layout_transforms"
+ ].items()
+ },
+ "footprint_layout": footprint_result["manifest"],
+ "yaw_sampling": {
+ "sample_count_per_variant": 360,
+ "score_type": "center_left_of_hard_count",
+ "top_view_plane": "XY",
+ "yaw_axis": "Z",
+ "left_right_axis": "X",
+ "front_back_axis": "Y",
+ "front_direction": "+Y",
+ "normal": alignment_candidates["normal"]["yaw_metadata"],
+ "flipped": alignment_candidates["flipped"]["yaw_metadata"],
+ },
+ "up_down_flip_check": up_down_flip_check_result,
+ "objects": object_outputs,
+ }
+
+
+def _complete_table_relative_scale_hint(
+    *,
+    table: dict[str, Any],
+    support_reference_scene: Any,
+    object_scenes: list[tuple[str, Any]],
+    table_alignment_matrix: np.ndarray,
+    trimesh: Any,
+) -> dict[str, Any]:
+    """Report the XY size ratio between the table support area and the clutter.
+
+    The hint is only produced when the table is flagged as completely
+    visible and at least one object scene is present. The table scene is
+    copied with ``table_alignment_matrix`` applied before its support quad
+    is detected.
+
+    Args:
+        table: Table record; only ``is_complete_visible_table`` is read.
+        support_reference_scene: Table scene to copy and align.
+        object_scenes: ``(object_id, scene)`` pairs forming the clutter.
+        table_alignment_matrix: Transform applied to the copied table scene.
+        trimesh: The imported ``trimesh`` module, passed through to helpers.
+
+    Returns:
+        A dict whose ``status`` is "ok", "skipped" (with a ``reason`` code)
+        or "failed" (with the formatted traceback as ``reason``).
+    """
+    skip_reason = None
+    if not table.get("is_complete_visible_table"):
+        skip_reason = "table_is_not_complete_visible"
+    elif not object_scenes:
+        skip_reason = "missing_object_scenes"
+    if skip_reason is not None:
+        return {"status": "skipped", "reason": skip_reason}
+
+    try:
+        aligned_table = GeometryManager.copy_scene_with_transform(
+            support_reference_scene,
+            table_alignment_matrix,
+        )
+        clutter_scenes = [scene for _, scene in object_scenes]
+        clutter_bounds = GeometryManager.table_fit_scene_union_bounds(
+            clutter_scenes,
+            trimesh=trimesh,
+        )
+        clutter_size_xy = GeometryManager.xy_aabb_size(clutter_bounds)
+        table_mesh = GeometryManager.scene_to_mesh(aligned_table, trimesh=trimesh)
+        # Guard the aspect ratio against a degenerate (zero-depth) clutter.
+        clutter_depth = max(float(clutter_size_xy[1]), 1.0e-6)
+        support_quad = GeometryManager.detect_table_fit_support_quad(
+            table_mesh,
+            target_aspect=float(clutter_size_xy[0] / clutter_depth),
+        )
+        support_size_xy = np.asarray(support_quad["size_xy"], dtype=np.float64)
+        ratio_xy = support_size_xy / np.maximum(clutter_size_xy, 1.0e-6)
+
+        all_finite = np.all(np.isfinite(ratio_xy))
+        if not all_finite or np.any(ratio_xy <= 0.0):
+            return {"status": "skipped", "reason": "invalid_raw_relative_size"}
+
+        note = (
+            "Ratio is unitless and is computed before metric scaling; "
+            "table fit later applies one uniform XYZ scale to the simready table."
+        )
+        return {
+            "status": "ok",
+            "method": "complete_table_sam3d_raw_support_to_clutter_ratio",
+            "raw_table_support_size_xy": support_size_xy.tolist(),
+            "raw_clutter_size_xy": clutter_size_xy.tolist(),
+            "support_to_clutter_size_ratio_xy": ratio_xy.tolist(),
+            "raw_table_support_quad": support_quad,
+            "note": note,
+        }
+    except Exception:
+        # Best-effort hint: report the failure instead of aborting the export.
+        return {"status": "failed", "reason": traceback.format_exc()}
+
+
+def _build_up_down_alignment_candidates(
+    *,
+    object_scenes: list[tuple[str, Any]],
+    support_normal: np.ndarray,
+    normal_alignment: np.ndarray,
+    spatial_relations: list[dict[str, Any]],
+    trimesh: Any,
+) -> dict[str, dict[str, Any]]:
+    """Build the "normal" and "flipped" yaw-aligned copies of the objects.
+
+    For each variant the input scenes are copied with the variant's pre-yaw
+    transform (identity or the support-normal flip), the best yaw for the
+    ``left_of`` relations is chosen from the copies' bounds, and that yaw is
+    applied to the copies in place. The input scenes are not modified.
+
+    Args:
+        object_scenes: ``(object_id, scene)`` pairs to copy.
+        support_normal: Estimated support normal, used to build the flip.
+        normal_alignment: Normal-to-up rotation, used to build the flip.
+        spatial_relations: Raw relation records; only ``left_of`` is used.
+        trimesh: The imported ``trimesh`` module, passed through to helpers.
+
+    Returns:
+        Mapping of variant name to a dict with ``object_scenes``,
+        ``pre_yaw_transform``, ``yaw_transform``, ``extra_transform``
+        (yaw composed after pre-yaw) and ``yaw_metadata``.
+    """
+    flip = GeometryManager.support_normal_flip_transform(
+        support_normal=support_normal,
+        normal_alignment=normal_alignment,
+    )
+    left_of_relations = _spatial_directional_relations(spatial_relations)
+    pre_yaw_by_variant = {
+        "normal": np.eye(4, dtype=np.float64),
+        "flipped": flip,
+    }
+
+    results: dict[str, dict[str, Any]] = {}
+    for variant_name, pre_yaw in pre_yaw_by_variant.items():
+        scene_copies = []
+        for object_id, source_scene in object_scenes:
+            copied = GeometryManager.copy_scene_with_transform(source_scene, pre_yaw)
+            scene_copies.append((object_id, copied))
+
+        bounds_by_id = {}
+        for object_id, copied in scene_copies:
+            mesh = GeometryManager.scene_to_mesh(copied, trimesh=trimesh)
+            bounds_by_id[object_id] = np.asarray(mesh.bounds, dtype=np.float64)
+
+        yaw_info = _best_spatial_yaw(
+            object_bounds=bounds_by_id,
+            relations=left_of_relations,
+        )
+        yaw = GeometryManager.z_yaw_transform(float(yaw_info["yaw_degrees"]))
+        for _, copied in scene_copies:
+            copied.apply_transform(yaw)
+
+        results[variant_name] = {
+            "object_scenes": scene_copies,
+            "pre_yaw_transform": pre_yaw,
+            "yaw_transform": yaw,
+            "extra_transform": yaw @ pre_yaw,
+            "yaw_metadata": yaw_info,
+        }
+    return results
+
+
+def _best_spatial_yaw(
+    *,
+    object_bounds: dict[str, np.ndarray],
+    relations: list[dict[str, str]],
+) -> dict[str, Any]:
+    """Pick the whole-degree Z yaw that satisfies the most ``left_of`` relations.
+
+    Every integer yaw from 0 to 359 degrees is applied to the objects' AABB
+    centers and scored with ``_center_left_of_score``. A yaw replaces the
+    current best when it satisfies more relations, or the same number with a
+    larger summed gap; on a full tie the smallest yaw is kept.
+
+    Args:
+        object_bounds: Mapping of object id to its AABB bounds.
+        relations: Deduplicated ``left_of`` relations to satisfy.
+
+    Returns:
+        A dict with ``yaw_degrees``, ``score``, ``raw_gap_sum``,
+        ``relation_count``, ``score_type`` and ``relation_scores``. With no
+        relations the yaw is 0 and ``relation_scores`` is an empty list, so
+        the result has the same keys in both cases.
+    """
+    if not relations:
+        return {
+            "yaw_degrees": 0,
+            "score": 0,
+            "raw_gap_sum": 0.0,
+            "relation_count": 0,
+            "score_type": "center_left_of_hard_count",
+            "relation_scores": [],
+        }
+
+    object_centers = {
+        object_id: GeometryManager.aabb_center(bounds)
+        for object_id, bounds in object_bounds.items()
+    }
+    best_yaw = 0
+    # Start below any reachable score so the first sample is always accepted.
+    best_score = -1
+    best_raw_gap_sum = float("-inf")
+    best_relation_scores: list[dict[str, Any]] = []
+    for yaw_degrees in range(360):
+        rotation = GeometryManager.z_yaw_transform(float(yaw_degrees))
+        rotated_centers = {
+            object_id: _transform_point(rotation, center)
+            for object_id, center in object_centers.items()
+        }
+        score, raw_gap_sum, relation_scores = _center_left_of_score(
+            centers=rotated_centers,
+            relations=relations,
+        )
+        if score > best_score or (
+            score == best_score and raw_gap_sum > best_raw_gap_sum
+        ):
+            best_yaw = yaw_degrees
+            best_score = score
+            best_raw_gap_sum = raw_gap_sum
+            best_relation_scores = relation_scores
+    return {
+        "yaw_degrees": best_yaw,
+        "score": best_score,
+        "raw_gap_sum": best_raw_gap_sum,
+        "relation_count": len(relations),
+        "score_type": "center_left_of_hard_count",
+        "relation_scores": best_relation_scores,
+    }
+
+
+def _spatial_directional_relations(
+    spatial_relations: list[dict[str, Any]],
+) -> list[dict[str, str]]:
+    """Extract the unique, well-formed ``left_of`` relations.
+
+    A record is kept when its subject and object are both non-empty after
+    string conversion, differ from each other, and its relation is exactly
+    ``"left_of"``. Duplicates are dropped; first-seen order is preserved.
+
+    Args:
+        spatial_relations: Raw records with ``subject``, ``relation`` and
+            ``object`` entries (missing or falsy values count as empty).
+
+    Returns:
+        Normalized relation dicts with string ``subject``, ``relation`` and
+        ``object`` values.
+    """
+    # Insertion-ordered dict doubles as the "seen" set and the result list.
+    unique: dict[tuple[str, str, str], dict[str, str]] = {}
+    for record in spatial_relations:
+        lhs = str(record.get("subject") or "")
+        rhs = str(record.get("object") or "")
+        name = str(record.get("relation") or "")
+        is_usable = bool(lhs) and bool(rhs) and lhs != rhs and name == "left_of"
+        if not is_usable:
+            continue
+        unique.setdefault(
+            (lhs, name, rhs),
+            {"subject": lhs, "relation": name, "object": rhs},
+        )
+    return list(unique.values())
+
+
+def _center_left_of_score(
+    centers: dict[str, np.ndarray],
+    relations: list[dict[str, str]],
+) -> tuple[int, float, list[dict[str, Any]]]:
+    """Score how many ``left_of`` relations hold for the given centers.
+
+    A relation holds when the object's first coordinate is strictly greater
+    than the subject's. Relations that mention an id missing from
+    ``centers`` are ignored.
+
+    Args:
+        centers: Mapping of object id to its center point.
+        relations: Relations with ``subject`` and ``object`` ids.
+
+    Returns:
+        ``(satisfied_count, summed_gap, per_relation_details)``.
+    """
+    details: list[dict[str, Any]] = []
+    gap_total = 0.0
+    for entry in relations:
+        lhs, rhs = entry["subject"], entry["object"]
+        if not (lhs in centers and rhs in centers):
+            continue
+        # Positive gap: the object sits at a larger first coordinate than
+        # the subject.
+        gap = float(centers[rhs][0] - centers[lhs][0])
+        gap_total += gap
+        details.append(
+            {
+                "subject": lhs,
+                "relation": "left_of",
+                "object": rhs,
+                "gap": gap,
+                "score": int(gap > 0.0),
+            }
+        )
+    satisfied = sum(item["score"] for item in details)
+    return satisfied, gap_total, details
+
+
+def _transform_point(transform: np.ndarray, point: np.ndarray) -> np.ndarray:
+    """Apply a 4x4 homogeneous ``transform`` to a 3D ``point``.
+
+    Args:
+        transform: Homogeneous transform matrix.
+        point: Point whose three coordinates are transformed.
+
+    Returns:
+        The first three components of the transformed homogeneous point.
+    """
+    # Start from (0, 0, 0, 1) and fill in the coordinates; w stays 1.
+    augmented = np.array([0.0, 0.0, 0.0, 1.0], dtype=np.float64)
+    augmented[:3] = point
+    moved = np.matmul(transform, augmented)
+    return moved[:3]
+
+
+def _resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Resolve a manifest path value to an absolute path.
+
+    Args:
+        value: Path-like value; a falsy value is treated as an empty path.
+        output_root: Base directory for relative values.
+
+    Returns:
+        The resolved path. Absolute values are resolved as they are;
+        relative values are resolved against ``output_root``.
+    """
+    candidate = Path(str(value or "")).expanduser()
+    anchored = candidate if candidate.is_absolute() else output_root / candidate
+    return anchored.resolve()
+
+
+def _write_json_file(path: Path, payload: dict[str, Any]) -> None:
+    """Write ``payload`` to ``path`` as indented UTF-8 JSON, best effort.
+
+    Parent directories are created as needed and the file ends with a
+    newline. This helper never raises: it is used for diagnostic artifacts,
+    including from inside exception handlers, so a failed write must not
+    mask the caller's real result. Failures are logged as warnings instead
+    of being dropped silently.
+
+    Args:
+        path: Destination file.
+        payload: JSON-serialisable mapping to write.
+    """
+    import json
+    import logging
+
+    try:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(
+            json.dumps(payload, ensure_ascii=False, indent=2) + "\n",
+            encoding="utf-8",
+        )
+    except Exception:
+        # Deliberately broad: serialisation and filesystem errors are both
+        # non-fatal here, but they should leave a trace in the log.
+        logging.getLogger(__name__).warning(
+            "Failed to write JSON file %s", path, exc_info=True
+        )
+
+
+def _run_aligned_up_down_flip_vlm_check(
+ *,
+ llm: Any | None,
+ original_image_path: Path | None,
+ normal_object_scenes: list[tuple[str, Any]],
+ flipped_object_scenes: list[tuple[str, Any]],
+ output_dir: Path,
+) -> dict[str, Any]:
+ """Ask the model whether the normal or the flipped candidate matches the image.
+
+ Both candidates are rendered, combined into one comparison image, and
+ sent to the model together with the original image. The flipped
+ candidate is selected only when the model picks it with a confidence of
+ at least ``UP_DOWN_FLIP_CHECK_CONFIDENCE_THRESHOLD``. The outcome is
+ also written to ``up_down_flip_selection.json`` in ``output_dir``.
+
+ Args:
+ llm: Model handle; when None the check is skipped after rendering.
+ original_image_path: Reference image; the check is skipped when it is
+ None or not an existing file.
+ normal_object_scenes: ``(object_id, scene)`` pairs of the normal variant.
+ flipped_object_scenes: ``(object_id, scene)`` pairs of the flipped variant.
+ output_dir: Directory for renders and JSON reports (created if needed).
+
+ Returns:
+ A dict whose ``status`` is "ok", "skipped" or "failed". Only an "ok"
+ result carries ``selected_variant``; on failure ``reason`` holds the
+ formatted traceback. This function does not raise for errors inside
+ the render/model step.
+ """
+ output_dir.mkdir(parents=True, exist_ok=True)
+ # Default outcome; updated in place as the check progresses.
+ result: dict[str, Any] = {
+ "status": "skipped",
+ "applied_up_down_flip": False,
+ "confidence_threshold": UP_DOWN_FLIP_CHECK_CONFIDENCE_THRESHOLD,
+ "reason": "",
+ }
+ if not normal_object_scenes or not flipped_object_scenes:
+ result["reason"] = "missing_object_scenes"
+ _write_json_file(output_dir / "up_down_flip_selection.json", result)
+ return result
+
+ try:
+ normal_render_path = output_dir / "normal_object_only_front_oblique_view.png"
+ flipped_render_path = output_dir / "flipped_object_only_front_oblique_view.png"
+ comparison_image_path = output_dir / "numbered_up_down_candidates.png"
+ # Rendering happens before the llm/image checks below, so the candidate
+ # images are produced even when the model step is skipped.
+ BlenderRenderingManager().render_object_scenes(
+ RenderObjectScenesRequest(
+ object_scenes=normal_object_scenes,
+ output_path=normal_render_path,
+ )
+ )
+ BlenderRenderingManager().render_object_scenes(
+ RenderObjectScenesRequest(
+ object_scenes=flipped_object_scenes,
+ output_path=flipped_render_path,
+ )
+ )
+ # The normal render is passed first and the flipped render second; the
+ # model's answer 1 maps to "normal" and 2 to "flipped" below.
+ MatplotlibManager(figsize=(12, 6), dpi=180).render_image_comparison(
+ RenderImageComparisonRequest(
+ first_image_path=normal_render_path,
+ second_image_path=flipped_render_path,
+ output_path=comparison_image_path,
+ )
+ )
+ if llm is None:
+ result["reason"] = "missing_llm"
+ _write_json_file(output_dir / "up_down_flip_selection.json", result)
+ return result
+ if original_image_path is None or not original_image_path.is_file():
+ result["reason"] = "missing_original_image"
+ _write_json_file(output_dir / "up_down_flip_selection.json", result)
+ return result
+
+ raw_model_output = call_structured_json_model_step(
+ llm=llm,
+ schema=UP_DOWN_FLIP_CHECK_JSON_SCHEMA,
+ messages=build_up_down_flip_check_messages(
+ original_image_path=original_image_path,
+ comparison_image_path=comparison_image_path,
+ ),
+ context="Unified scene aligned up-down flip check",
+ attempt_count=0,
+ raw_output_writer=lambda payload: _write_json_file(
+ output_dir / "vlm_flip_check_result.json",
+ payload,
+ ),
+ )
+ confidence = float(raw_model_output.get("confidence", 0.0))
+ selected_number = int(raw_model_output.get("selected_number", 1))
+ # Any answer other than 1 or 2 is treated as 1 ("normal").
+ if selected_number not in {1, 2}:
+ selected_number = 1
+ model_selected_variant = "flipped" if selected_number == 2 else "normal"
+ # Flipping is the riskier outcome, so it needs enough confidence;
+ # choosing "normal" is accepted at any confidence.
+ should_apply = (
+ model_selected_variant == "flipped"
+ and confidence >= UP_DOWN_FLIP_CHECK_CONFIDENCE_THRESHOLD
+ )
+ selected_variant = "flipped" if should_apply else "normal"
+ # Report the number of the variant actually applied; the model's own
+ # answer is kept separately under the "model_selected_*" keys.
+ selected_number = 2 if selected_variant == "flipped" else 1
+ result.update(
+ {
+ "status": "ok",
+ "selected_number": selected_number,
+ "selected_variant": selected_variant,
+ "applied_up_down_flip": should_apply,
+ "model_selected_number": raw_model_output.get("selected_number"),
+ "model_selected_variant": model_selected_variant,
+ "confidence": confidence,
+ "reason": str(raw_model_output.get("reason", "")),
+ }
+ )
+ _write_json_file(output_dir / "up_down_flip_selection.json", result)
+ return result
+ except Exception:
+ # Rendering or model errors must not abort the export: record the
+ # traceback and return without a "selected_variant".
+ result.update(
+ {
+ "status": "failed",
+ "reason": traceback.format_exc(),
+ }
+ )
+ _write_json_file(output_dir / "up_down_flip_selection.json", result)
+ return result
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_scene_asset_generation.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_scene_asset_generation.py
new file mode 100644
index 000000000..d57da062e
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_scene_asset_generation.py
@@ -0,0 +1,643 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+import shutil
+import traceback
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.utils.log import log_info, log_warning
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client import (
+ decode_rle_mask,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_generation_manager import (
+ GeometryGenerationManager,
+ RgbaImageToGeometryRequest,
+ RgbaImagesToGeometriesRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_generation_manager import (
+ ImageGenerationManager,
+ TextToAssetImageRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_segmentation_manager import (
+ AssetImageToRgbaRequest,
+ ImageSegmentationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.image_layout_alignment import (
+ _export_support_aligned_layout_glbs,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager import (
+ MakeAssetSimreadyRequest,
+ MakeTableSimreadyRequest,
+ SimreadyManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager import (
+ METRIC_SCALE_ENABLED,
+ EstimateMetricScalesRequest,
+ IMAGE_METRIC_SCALE_JSON_SCHEMA,
+ MetricScaleManager,
+ MetricScaleObjectInput,
+ build_image_metric_scale_messages,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+ GeometryManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.layout_manifests import (
+ _write_multi_object_layout_manifests,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import (
+ relative_path,
+)
+
+# Public entry point of this module; every other definition here is a private helper.
+__all__ = ["generate_image_scene_assets"]
+
+# Value passed as ``step_name`` on the metric-scale estimation request.
+UNIFIED_SCENE_STEP = "unified_scene"
+
+
+def generate_image_scene_assets(
+    object_specs: list[dict[str, Any]],
+    table_spec: dict[str, Any],
+    spatial_relations: list[dict[str, Any]],
+    segments_data: dict[str, Any],
+    image_gen_dir: Path,
+    glb_gen_dir: Path,
+    debug_dir: Path,
+    output_root: Path,
+    llm: Any | None = None,
+) -> dict[str, Any]:
+    """Run layout-aware table/support and object generation from image masks.
+
+    The function decodes the table mask and every usable object mask to PNG
+    files, converts them to geometry in one batched request, makes each
+    result simready, optionally replaces a partially visible table with a
+    description-generated one, estimates metric scales, aligns the objects
+    on the table and finally writes the layout manifest.
+
+    Args:
+        object_specs: Object dictionaries; ``id``, ``name`` and
+            ``description`` are read. Specs without an id, without a matching
+            entry in ``segments_data["asset_segments"]`` or without a
+            ``mask_rle`` are skipped silently.
+        table_spec: Table dictionary; ``id``, ``name``, ``description``,
+            ``complete_table_description`` and ``is_complete_visible_table``
+            are read.
+        spatial_relations: Forwarded unchanged to the alignment step.
+        segments_data: Segmentation output; ``image_path``,
+            ``asset_segments``, ``table_segment`` and
+            ``bbox_name_image_path`` are read.
+        image_gen_dir: Directory receiving the RGBA image of a
+            description-generated table.
+        glb_gen_dir: Root for ``raw_downloads``,
+            ``multi_object_layouts_simready``,
+            ``multi_object_layouts_aligned`` and the metric-scale raw output.
+        debug_dir: Root for decoded mask PNGs and description-generated
+            table debug artefacts.
+        output_root: Base directory used to express returned paths
+            relatively.
+        llm: Model handle forwarded to metric-scale estimation and to the
+            alignment step; may be ``None``.
+
+    Returns:
+        A dictionary with ``status`` (``"ok"`` or ``"failed"``), ``table``,
+        ``objects``, ``alignment`` and ``manifests``; ``reason`` holds the
+        formatted traceback when generation or alignment failed.
+
+    Note:
+        Failures inside the generation and alignment ``try`` blocks are
+        caught and reported through ``status``/``reason``. Directory creation
+        and mask decoding run before those blocks and are not caught here.
+    """
+    log_info(f"image object layout generation started count={len(object_specs)}")
+    status = "ok"
+    failure_reason = ""
+    original_image_path = str(segments_data.get("image_path", ""))
+    # Index object segments by asset id so each object spec can find its mask.
+    segment_by_id: dict[str, dict[str, Any]] = {
+        str(seg["asset_id"]): seg
+        for seg in segments_data.get("asset_segments", [])
+        if seg.get("asset_id")
+    }
+    table_segment = segments_data.get("table_segment")
+    if not isinstance(table_segment, dict):
+        table_segment = None
+    debug_subdir = debug_dir / "multi_object_masks"
+    masks_dir = debug_subdir / "masks"
+    raw_download_dir = glb_gen_dir / "raw_downloads"
+    simready_dir = glb_gen_dir / "multi_object_layouts_simready"
+    aligned_dir = glb_gen_dir / "multi_object_layouts_aligned"
+    masks_dir.mkdir(parents=True, exist_ok=True)
+    raw_download_dir.mkdir(parents=True, exist_ok=True)
+    simready_dir.mkdir(parents=True, exist_ok=True)
+    aligned_dir.mkdir(parents=True, exist_ok=True)
+
+    # requested_items and mask_paths are appended in lockstep; their order is
+    # later compared against the order of the generated geometries.
+    requested_items: list[dict[str, Any]] = []
+    mask_paths: list[Path] = []
+
+    table_id = str(table_spec.get("id", "table")).strip() or "table"
+    table_name = str(table_spec.get("name", "table")).strip() or "table"
+    is_complete_visible_table = bool(
+        table_spec.get("is_complete_visible_table", False)
+    )
+    # A missing table mask is only recorded here; the error is raised inside
+    # the try block below so it ends up in the returned status/reason.
+    skipped_table: dict[str, Any] | None = None
+    if table_segment is None:
+        skipped_table = {
+            "id": table_id,
+            "name": table_name,
+            "reason": "missing_table_segment",
+        }
+    else:
+        table_mask_rle = table_segment.get("mask_rle")
+        if table_mask_rle is None:
+            skipped_table = {
+                "id": table_id,
+                "name": table_name,
+                "reason": "missing_table_mask_rle",
+            }
+        else:
+            # The table is registered first, so its mask file gets index 0000.
+            mask_path = masks_dir / f"{len(requested_items):04d}_{table_id}_mask.png"
+            decode_rle_mask(table_mask_rle).save(mask_path)
+            mask_paths.append(mask_path)
+            requested_items.append(
+                {
+                    "id": table_id,
+                    "name": table_name,
+                    "kind": "table",
+                    "mask_path": str(mask_path),
+                }
+            )
+
+    for obj_spec in object_specs:
+        obj_id = str(obj_spec.get("id", "")).strip()
+        obj_name = str(obj_spec.get("name", "")).strip()
+        # Objects that cannot be tied to a decoded mask are dropped silently.
+        if not obj_id:
+            continue
+        segment = segment_by_id.get(obj_id)
+        if segment is None:
+            continue
+        mask_rle = segment.get("mask_rle")
+        if mask_rle is None:
+            continue
+
+        mask_path = masks_dir / f"{len(requested_items):04d}_{obj_id}_mask.png"
+        decode_rle_mask(mask_rle).save(mask_path)
+        mask_paths.append(mask_path)
+        requested_items.append(
+            {
+                "id": obj_id,
+                "name": obj_name,
+                "description": str(obj_spec.get("description", "")),
+                "kind": "object",
+                "mask_path": str(mask_path),
+            }
+        )
+
+    generated_objects: list[dict[str, Any]] = []
+    generated_table: dict[str, Any] | None = None
+    image_manager = ImageGenerationManager()
+    segmentation_manager = ImageSegmentationManager()
+    geometry_manager = GeometryGenerationManager()
+    simready_manager = SimreadyManager()
+    try:
+        if skipped_table is not None:
+            raise ValueError(
+                "No valid table/support mask found for image multi-object "
+                f"layout generation: {skipped_table['reason']}"
+            )
+        if not mask_paths:
+            raise ValueError(
+                "No valid masks found for image multi-object layout generation."
+            )
+
+        # One batched request converts every mask of the original image.
+        result = geometry_manager.convert_rgba_images_to_geometries(
+            RgbaImagesToGeometriesRequest(
+                image_path=Path(original_image_path),
+                mask_paths=mask_paths,
+                output_dir=raw_download_dir,
+            )
+        )
+        if len(result.objects) != len(requested_items):
+            raise RuntimeError(
+                "Multi-object SAM3D result count mismatch: "
+                f"requested {len(requested_items)}, got {len(result.objects)}"
+            )
+        for requested, generated in zip(requested_items, result.objects):
+            # Each result must be named after the stem of the mask file that
+            # was requested at the same position.
+            expected_sam3d_name = Path(requested["mask_path"]).stem
+            if generated.name != expected_sam3d_name:
+                raise RuntimeError(
+                    "Multi-object SAM3D result order mismatch: "
+                    f"expected {expected_sam3d_name!r}, got {generated.name!r}"
+                )
+            downloaded_raw_path = Path(generated.geometry_path).expanduser().resolve()
+            raw_geometry_path = str(downloaded_raw_path)
+            # Per-item problems are collected here instead of aborting the
+            # whole batch; a non-empty list becomes the item's status string.
+            status_parts: list[str] = []
+            transform_matrix: list[list[float]] = []
+            try:
+                transform = GeometryManager.compose_sam3d_multi_object_transform(
+                    rotation_quaternion_wxyz=generated.rotation_quaternion_wxyz,
+                    translation=generated.translation,
+                    scale=generated.scale,
+                )
+                transform_matrix = transform.tolist()
+            except Exception:
+                status_parts.append(
+                    f"transform_matrix_failed: {traceback.format_exc()}"
+                )
+
+            simready_geometry_path = ""
+            raw_to_simready_glb_matrix: list[list[float]] = []
+            metric_scale: dict[str, Any] | None = None
+            try:
+                if requested["kind"] == "table":
+                    # A partially visible table is not made simready here; it
+                    # is replaced by a description-generated table below.
+                    if is_complete_visible_table:
+                        table_result = simready_manager.make_table_simready(
+                            MakeTableSimreadyRequest(
+                                input_path=Path(raw_geometry_path),
+                                output_path=simready_dir
+                                / f"{requested['id']}_simready.glb",
+                            )
+                        )
+                        simready_geometry_path = str(table_result.output_path)
+                        raw_to_simready_glb_matrix = table_result.transform_matrix
+                else:
+                    asset_result = simready_manager.make_asset_simready(
+                        MakeAssetSimreadyRequest(
+                            input_path=Path(raw_geometry_path),
+                            output_path=simready_dir
+                            / f"{requested['id']}_simready.glb",
+                        )
+                    )
+                    simready_geometry_path = str(asset_result.output_path)
+                    raw_to_simready_glb_matrix = asset_result.transform_matrix
+            except Exception:
+                status_parts.append(f"simready_failed: {traceback.format_exc()}")
+            item_status = "ok" if not status_parts else "; ".join(status_parts)
+            generated_item = {
+                "id": requested["id"],
+                "name": requested["name"],
+                "kind": requested["kind"],
+                "description": str(table_spec.get("description", ""))
+                if requested["kind"] == "table"
+                else str(requested.get("description", "")),
+                "complete_table_description": str(
+                    table_spec.get("complete_table_description")
+                    or table_spec.get("description", "")
+                ).strip()
+                if requested["kind"] == "table"
+                else "",
+                "is_complete_visible_table": is_complete_visible_table
+                if requested["kind"] == "table"
+                else False,
+                "status": item_status,
+                "mask_path": relative_path(requested["mask_path"], output_root),
+                "raw_geometry_path": relative_path(raw_geometry_path, output_root),
+                "simready_geometry_path": relative_path(
+                    simready_geometry_path, output_root
+                )
+                if simready_geometry_path
+                else "",
+                "mesh_path": relative_path(simready_geometry_path, output_root)
+                if simready_geometry_path
+                else "",
+                "sam3d_name": generated.name,
+                "downloaded_raw_geometry_path": relative_path(
+                    str(downloaded_raw_path), output_root
+                ),
+                "rotation_quaternion_wxyz": generated.rotation_quaternion_wxyz,
+                "translation": generated.translation,
+                "scale": generated.scale,
+                "transform_matrix": transform_matrix,
+                "raw_to_simready_glb_matrix": raw_to_simready_glb_matrix,
+                "metric_scale": metric_scale,
+            }
+            if requested["kind"] == "table":
+                # The segmented table geometry is always kept as the support
+                # reference, even when the table asset itself is replaced.
+                support_reference_path = raw_download_dir / "support_surface_raw.glb"
+                table_raw_path = raw_download_dir / "table_raw.glb"
+                shutil.copy2(downloaded_raw_path, support_reference_path)
+                if is_complete_visible_table:
+                    shutil.copy2(downloaded_raw_path, table_raw_path)
+                    generated_item["raw_geometry_path"] = relative_path(
+                        str(table_raw_path),
+                        output_root,
+                    )
+                generated_item["support_reference_geometry_path"] = relative_path(
+                    str(support_reference_path),
+                    output_root,
+                )
+                generated_item["support_reference_transform_matrix"] = transform_matrix
+                generated_item["support_normal_source"] = "segmented_table"
+                generated_item["table_asset_source"] = "segmented_table"
+                if not is_complete_visible_table:
+                    # Replace partial image table with description-generated table.
+                    incomplete_table_id = str(
+                        generated_item.get("id")
+                        or table_spec.get("id")
+                        or "table"
+                    )
+                    incomplete_table_desc = str(
+                        table_spec.get("complete_table_description")
+                        or table_spec.get("description", "")
+                    ).strip()
+                    incomplete_debug_dir = (
+                        debug_dir / incomplete_table_id / "description_generated"
+                    )
+                    incomplete_debug_dir.mkdir(parents=True, exist_ok=True)
+                    incomplete_raw_download_dir = glb_gen_dir / "raw_downloads"
+                    incomplete_raw_download_dir.mkdir(parents=True, exist_ok=True)
+                    # Pipeline: text -> image -> RGBA cut-out -> raw GLB -> simready GLB.
+                    incomplete_raw_image = str(
+                        image_manager.generate_asset_image_from_text(
+                            TextToAssetImageRequest(
+                                prompt=incomplete_table_desc,
+                                output_path=incomplete_debug_dir
+                                / f"{incomplete_table_id}_complete.png",
+                            )
+                        )
+                    )
+                    incomplete_rgba = str(
+                        segmentation_manager.convert_asset_image_to_rgba(
+                            AssetImageToRgbaRequest(
+                                image_path=Path(incomplete_raw_image),
+                                prompt=incomplete_table_desc
+                                if incomplete_table_desc.strip()
+                                else "whole table",
+                                output_path=image_gen_dir
+                                / f"{incomplete_table_id}_complete.png",
+                            )
+                        )
+                    )
+                    incomplete_raw_glb = str(
+                        geometry_manager.convert_rgba_image_to_geometry(
+                            RgbaImageToGeometryRequest(
+                                image_path=Path(incomplete_rgba),
+                                output_path=incomplete_debug_dir
+                                / f"{incomplete_table_id}_complete_raw.glb",
+                            )
+                        )
+                    )
+                    incomplete_table_raw_path = (
+                        incomplete_raw_download_dir / "table_raw.glb"
+                    )
+                    shutil.copy2(incomplete_raw_glb, incomplete_table_raw_path)
+                    incomplete_simready = simready_manager.make_table_simready(
+                        MakeTableSimreadyRequest(
+                            input_path=incomplete_table_raw_path,
+                            output_path=glb_gen_dir
+                            / "multi_object_layouts_simready"
+                            / f"{incomplete_table_id}_simready.glb",
+                        )
+                    )
+                    # The generated table overrides the segmented table's
+                    # paths and gets an identity transform_matrix.
+                    generated_item.update(
+                        {
+                            "image_path": relative_path(
+                                incomplete_rgba, output_root
+                            ),
+                            "raw_geometry_path": relative_path(
+                                str(incomplete_table_raw_path), output_root
+                            ),
+                            "generated_table_raw_geometry_path": relative_path(
+                                incomplete_raw_glb, output_root
+                            ),
+                            "simready_geometry_path": relative_path(
+                                str(incomplete_simready.output_path),
+                                output_root,
+                            ),
+                            "mesh_path": relative_path(
+                                str(incomplete_simready.output_path),
+                                output_root,
+                            ),
+                            "raw_to_simready_glb_matrix": (
+                                incomplete_simready.transform_matrix
+                            ),
+                            "transform_matrix": np.eye(
+                                4, dtype=np.float64
+                            ).tolist(),
+                            "table_asset_source": "description_generated",
+                            "complete_table_description": incomplete_table_desc,
+                        }
+                    )
+                generated_table = generated_item
+            else:
+                generated_objects.append(generated_item)
+    except Exception as exc:
+        # Items collected before the failure are kept and still reported.
+        status = "failed"
+        failure_reason = traceback.format_exc()
+        log_warning(f"image object geometry generation failed error={exc}")
+
+    if generated_objects:
+        # The helper's summary is not used here; it is called for what it does
+        # to the object entries through MetricScaleManager.
+        _estimate_image_scene_metric_scales(
+            objects=generated_objects,
+            bbox_name_image_path=segments_data.get("bbox_name_image_path"),
+            output_dir=glb_gen_dir,
+            output_root=output_root,
+            llm=llm,
+        )
+
+    alignment_result: dict[str, Any] | None = None
+    if generated_table is not None and generated_objects:
+        try:
+            alignment_result = _export_support_aligned_layout_glbs(
+                table=generated_table,
+                objects=generated_objects,
+                spatial_relations=spatial_relations,
+                original_image_path=Path(original_image_path)
+                if original_image_path
+                else None,
+                llm=llm,
+                output_dir=aligned_dir,
+                output_root=output_root,
+            )
+            # Copy each aligned GLB path back onto the matching object entry.
+            aligned_object_by_id = {
+                item["id"]: item for item in alignment_result["objects"]
+            }
+            for generated_object in generated_objects:
+                aligned_object = aligned_object_by_id.get(generated_object["id"])
+                if aligned_object is not None:
+                    generated_object["aligned_geometry_path"] = aligned_object[
+                        "aligned_geometry_path"
+                    ]
+            if isinstance(generated_table, dict):
+                generated_table["complete_table_relative_scale_hint"] = (
+                    alignment_result.get("complete_table_relative_scale_hint")
+                )
+        except Exception as exc:
+            status = "failed"
+            failure_reason = traceback.format_exc()
+            log_warning(f"image object alignment failed error={exc}")
+            alignment_result = {
+                "status": "failed",
+                "reason": failure_reason,
+            }
+
+    # The manifest is written even after a failure, from whatever was produced.
+    manifest_paths = _write_multi_object_layout_manifests(
+        glb_gen_dir=glb_gen_dir,
+        output_root=output_root,
+        table=generated_table,
+        objects=generated_objects,
+        alignment=alignment_result,
+    )
+    # Only the keys listed below are copied into the returned table/object
+    # entries; the full entries went to the manifest writer above.
+    table_fields = (
+        "id",
+        "name",
+        "status",
+        "is_complete_visible_table",
+        "complete_table_description",
+        "object_coverage_percent",
+        "table_asset_source",
+        "support_normal_source",
+        "image_path",
+        "raw_geometry_path",
+        "support_reference_geometry_path",
+        "generated_table_raw_geometry_path",
+        "transformed_geometry_path",
+        "simready_geometry_path",
+        "aligned_geometry_path",
+        "mesh_path",
+        "complete_table_relative_scale_hint",
+    )
+    object_fields = (
+        "id",
+        "name",
+        "description",
+        "status",
+        "image_path",
+        "mesh_path",
+        "aligned_geometry_path",
+        "metric_scale",
+    )
+    workflow_table = (
+        {key: generated_table[key] for key in table_fields if key in generated_table}
+        if generated_table is not None
+        else None
+    )
+    workflow_objects = [
+        {key: item[key] for key in object_fields if key in item}
+        for item in generated_objects
+    ]
+    # Detailed per-item status strings (which may embed tracebacks) are
+    # collapsed to a plain "failed" in the returned summary.
+    if workflow_table is not None and workflow_table.get("status") != "ok":
+        workflow_table["status"] = "failed"
+    for item in workflow_objects:
+        if item.get("status") != "ok":
+            item["status"] = "failed"
+    workflow_alignment = (
+        {
+            key: alignment_result[key]
+            for key in (
+                "status",
+                "final_clutter_2d_aabb_cm",
+                "complete_table_relative_scale_hint",
+            )
+            if key in alignment_result
+        }
+        if alignment_result is not None
+        else None
+    )
+    result = {
+        "status": status,
+        "table": workflow_table,
+        "objects": workflow_objects,
+        "alignment": workflow_alignment,
+        "manifests": manifest_paths,
+    }
+    if failure_reason:
+        result["reason"] = failure_reason
+    log_info(
+        "image object layout generation completed "
+        f"status={status} generated={len(generated_objects)}"
+    )
+    return result
+
+
+def _estimate_image_scene_metric_scales(
+    *,
+    objects: list[dict[str, Any]],
+    bbox_name_image_path: Any,
+    output_dir: Path,
+    output_root: Path,
+    llm: Any | None,
+) -> dict[str, Any]:
+    """Estimate metric scales for the generated objects of an image scene.
+
+    Estimation is skipped when metric scaling is disabled, when no ``llm`` is
+    given, or when the bbox/name image cannot be found. In every outcome the
+    ``objects`` entries are updated through ``MetricScaleManager``; any
+    exception is caught and reported as a ``"failed"`` summary.
+
+    Args:
+        objects: Generated object entries to estimate scales for.
+        bbox_name_image_path: Path to the bbox/name image; relative values
+            are resolved against ``output_root``.
+        output_dir: Directory receiving the raw model output JSON.
+        output_root: Base directory for resolving relative paths.
+        llm: Model handle used for the estimate, or ``None``.
+
+    Returns:
+        A summary dictionary with ``status``, ``method``,
+        ``bbox_name_image_path`` and ``objects``, plus ``reason`` when
+        skipped or failed and ``object_scales``/``unit_note`` on success.
+    """
+    method = "image_scene_bbox_name_vlm_candidate_shape_ratio_median_scale"
+    summary: dict[str, Any] = {
+        "status": "skipped",
+        "method": method,
+        "bbox_name_image_path": str(bbox_name_image_path or ""),
+        "objects": [],
+    }
+    try:
+        # Work out whether (and why) estimation must be skipped. The bbox
+        # image is only resolved once the cheaper checks have passed.
+        bbox_image = None
+        if not METRIC_SCALE_ENABLED:
+            skip_reason = "metric_scale_disabled"
+        elif llm is None:
+            skip_reason = "missing_llm"
+        else:
+            bbox_image = _resolve_generated_path(bbox_name_image_path, output_root)
+            skip_reason = "" if bbox_image.is_file() else "missing_bbox_name_image"
+
+        if skip_reason:
+            summary["reason"] = skip_reason
+            MetricScaleManager.set_for_all_objects(
+                objects=objects,
+                status="skipped",
+                reason=skip_reason,
+                method=method,
+            )
+            return summary
+
+        scale_inputs = _build_metric_scale_inputs(
+            objects=objects,
+            output_root=output_root,
+        )
+        prompt_payload = MetricScaleManager.object_prompt_payload(scale_inputs)
+        summary["objects"] = prompt_payload
+        request = EstimateMetricScalesRequest(
+            objects=scale_inputs,
+            messages=build_image_metric_scale_messages(
+                bbox_name_image_path=bbox_image,
+                objects_json=prompt_payload,
+            ),
+            schema=IMAGE_METRIC_SCALE_JSON_SCHEMA,
+            llm=llm,
+            context="Image scene metric scale estimate",
+            method=method,
+            step_name=UNIFIED_SCENE_STEP,
+            raw_output_path=output_dir / "image_metric_scale_raw_model_output.json",
+        )
+        object_scales = MetricScaleManager.estimate_metric_scales(
+            request
+        ).object_scales
+        MetricScaleManager.apply_to_objects(
+            objects=objects, object_scales=object_scales
+        )
+        summary["status"] = "ok"
+        summary["object_scales"] = object_scales
+        summary["unit_note"] = (
+            "Per-object scale_factor is not baked into simready GLBs. "
+            "Image alignment later computes one clamped global clutter "
+            "scale from these per-object estimates, on top of SAM3D "
+            "per-object layout scale."
+        )
+    except Exception:
+        # Best effort: record the traceback and mark every object as failed.
+        summary["status"] = "failed"
+        summary["reason"] = traceback.format_exc()
+        MetricScaleManager.set_for_all_objects(
+            objects=objects,
+            status="failed",
+            reason="image_scene_metric_scale_failed",
+            method=method,
+        )
+    return summary
+
+
+def _build_metric_scale_inputs(
+    *,
+    objects: list[dict[str, Any]],
+    output_root: Path,
+) -> list[MetricScaleObjectInput]:
+    """Convert generated object entries into metric-scale estimation inputs.
+
+    Each entry's ``simready_geometry_path`` (falling back to ``mesh_path``)
+    is resolved against ``output_root`` and must point at an existing file.
+
+    Raises:
+        FileNotFoundError: If an object's resolved GLB path is not a file.
+    """
+
+    def _to_input(entry: dict[str, Any]) -> MetricScaleObjectInput:
+        glb_path = _resolve_generated_path(
+            entry.get("simready_geometry_path") or entry.get("mesh_path"),
+            output_root,
+        )
+        if not glb_path.is_file():
+            raise FileNotFoundError(f"Simready object GLB not found: {glb_path}")
+        return MetricScaleObjectInput(
+            object_id=str(entry.get("id", "")),
+            object_name=str(entry.get("name", "")),
+            object_description=str(entry.get("description", "")),
+            mesh_path=glb_path,
+        )
+
+    return [_to_input(entry) for entry in objects]
+
+
+def _resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Return *value* as a resolved path, anchoring relative values at *output_root*.
+
+    Falsy values (``None``, ``""``) are treated as an empty path and
+    therefore resolve to ``output_root`` itself.
+    """
+    candidate = Path(str(value or "")).expanduser()
+    anchored = candidate if candidate.is_absolute() else output_root / candidate
+    return anchored.resolve()
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_segment_filter.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_segment_filter.py
new file mode 100644
index 000000000..62b88bdd8
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/image_segment_filter.py
@@ -0,0 +1,275 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Callable
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client import (
+ decode_rle_mask,
+ draw_numbered_masks,
+ sort_segments_by_bbox,
+)
+from embodichain.gen_sim.prompt2scene.llms.llm_output import (
+ call_structured_json_model_step,
+ is_model_output_error,
+)
+from embodichain.gen_sim.prompt2scene.prompts.builders import (
+ build_filter_extra_instances_messages,
+)
+from embodichain.gen_sim.prompt2scene.prompts.schemas import (
+ FILTER_EXTRA_INSTANCES_JSON_SCHEMA,
+)
+from embodichain.gen_sim.prompt2scene.utils import log_api_request_start, log
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+ format_attempt_error,
+)
+
+# Public API of this module.
+__all__ = [
+    "filter_group_segments_with_vlm",
+    "filter_overlapping_confirmed_segments",
+    "filter_segments_with_vlm",
+    "remove_extra_numbered_segments",
+]
+
+# Callback that persists a debug JSON payload; this module invokes it with the
+# keyword arguments ``round_name``, ``filename`` and ``payload``.
+DebugWriter = Callable[[str, str, dict[str, Any]], Path]
+# A candidate is dropped as a duplicate when at least this fraction of its own
+# pixels lies inside an already confirmed mask ...
+CONFIRMED_MASK_COVERAGE_THRESHOLD = 0.85
+# ... or when its intersection-over-union with a confirmed mask reaches this value.
+CONFIRMED_MASK_IOU_THRESHOLD = 0.70
+
+
+def remove_extra_numbered_segments(
+    *,
+    segments: list[dict[str, Any]],
+    raw_model_output: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Remove numbered masks flagged as extra by the VLM.
+
+    Args:
+        segments: Candidate segments in the order in which they were numbered
+            (mask number ``k`` refers to ``segments[k - 1]``).
+        raw_model_output: Parsed model reply; its ``extra_instance_numbers``
+            entry must be a list of 1-based mask numbers.
+
+    Returns:
+        A new list with the flagged segments removed, original order kept.
+
+    Raises:
+        ValueError: If ``extra_instance_numbers`` is not a list, contains an
+            entry that cannot be converted to an integer, or references a
+            mask number outside ``1..len(segments)``.
+    """
+    extra_numbers = raw_model_output.get("extra_instance_numbers")
+    if not isinstance(extra_numbers, list):
+        raise ValueError("extra_instance_numbers must be a list.")
+    try:
+        extra_indices = {int(number) - 1 for number in extra_numbers}
+    except (TypeError, ValueError, OverflowError) as exc:
+        # int(None) / int([..]) raise TypeError and int(inf) raises
+        # OverflowError; normalise them to ValueError so every malformed
+        # model reply surfaces as the one exception type documented here.
+        raise ValueError(
+            "extra_instance_numbers must contain only integers."
+        ) from exc
+    if any(index < 0 or index >= len(segments) for index in extra_indices):
+        raise ValueError("VLM returned an out-of-range extra mask number.")
+    return [
+        segment
+        for index, segment in enumerate(segments)
+        if index not in extra_indices
+    ]
+
+
+def filter_group_segments_with_vlm(
+    *,
+    llm: Any,
+    image_path: Path,
+    step_name: str,
+    group: dict[str, Any],
+    segments: list[dict[str, Any]],
+    stage: str,
+    debug_round_name: str,
+    debug_round_dir: Path,
+    write_debug_json: DebugWriter,
+    confirmed_segments: list[dict[str, Any]] | None = None,
+) -> list[dict[str, Any]]:
+    """Ask the VLM to drop wrong or duplicate instances from one SAM3 result.
+
+    Candidates that nearly duplicate an already confirmed mask are removed
+    first, the rest are sorted by bounding box and drawn as numbered masks,
+    and the VLM then names the numbers to discard. The overlay image path is
+    recorded in ``group["debug_images"]`` (the *group* dict is mutated).
+
+    Every path concern is injected through *step_name*, *debug_round_name*,
+    *debug_round_dir* and *write_debug_json*, keeping this tool independent
+    of workflow internals.
+    """
+    candidates = sort_segments_by_bbox(
+        filter_overlapping_confirmed_segments(
+            segments=segments,
+            confirmed_segments=confirmed_segments or [],
+        )
+    )
+    if not candidates:
+        return candidates
+
+    overlay_path = draw_numbered_masks(
+        image_path=image_path,
+        segments=candidates,
+        output_path=debug_round_dir / "mask.png",
+    )
+    overlay_name = str(overlay_path)
+    recorded_images = list(group.get("debug_images") or [])
+    if overlay_name not in recorded_images:
+        recorded_images.append(overlay_name)
+    group["debug_images"] = recorded_images
+
+    log_api_request_start(
+        step=step_name,
+        request=f"vlm_filter_{stage}",
+        debug_image=overlay_name,
+    )
+
+    def _persist_raw_output(payload: dict[str, Any]) -> Path:
+        # Store the untouched model reply next to this round's debug data.
+        return write_debug_json(
+            round_name=debug_round_name,
+            filename="raw_model_output.json",
+            payload=payload,
+        )
+
+    model_reply = call_structured_json_model_step(
+        llm=llm,
+        schema=FILTER_EXTRA_INSTANCES_JSON_SCHEMA,
+        messages=build_filter_extra_instances_messages(
+            debug_image_path=overlay_path,
+            name=group["name"],
+            description=group["description"],
+            expected_count=group["expected_count"],
+            class_candidate=group["class_candidate"],
+        ),
+        context=f"Image relation {stage} segmentation filtering",
+        attempt_count=0,
+        raw_output_writer=_persist_raw_output,
+    )
+    return remove_extra_numbered_segments(
+        segments=candidates,
+        raw_model_output=model_reply,
+    )
+
+
+def filter_segments_with_vlm(
+    *,
+    llm: Any,
+    image_path: Path,
+    step_name: str,
+    segment_groups: list[dict[str, Any]],
+    attempt_count: int,
+    errors: list[str],
+    stage: str,
+    next_debug_round_name: Callable[[str], str],
+    debug_round_dir: Callable[[str], Path],
+    write_debug_json: DebugWriter,
+) -> dict[str, object]:
+    """Filter every segment group with the VLM and return a state patch.
+
+    Groups are processed in order on shallow copies; segments accepted for
+    earlier groups are passed on as confirmed masks for later groups. Model
+    output errors and ``ValueError`` are turned into an error patch
+    (``attempt_count``, ``last_error``, ``errors``); anything else is
+    re-raised. On success the patch carries ``attempt_count``,
+    ``segment_groups`` and ``last_error=None``.
+
+    Path handling is supplied entirely through callbacks so this tool stays
+    independent of workflow internals.
+    """
+    this_attempt = attempt_count + 1
+    filtered_groups: list[dict[str, Any]] = []
+    accepted_segments: list[dict[str, Any]] = []
+
+    try:
+        for source_group in segment_groups:
+            # Work on a shallow copy so the caller's group dict is untouched.
+            working_group = dict(source_group)
+            group_label = str(working_group.get("name", "unknown"))
+            round_name = next_debug_round_name(f"{stage}_{group_label}")
+            round_dir = debug_round_dir(round_name)
+            kept_segments = filter_group_segments_with_vlm(
+                llm=llm,
+                image_path=image_path,
+                step_name=step_name,
+                group=working_group,
+                segments=working_group["segments"],
+                stage=stage,
+                debug_round_name=round_name,
+                debug_round_dir=round_dir,
+                write_debug_json=write_debug_json,
+                confirmed_segments=accepted_segments,
+            )
+            working_group["segments"] = kept_segments
+            accepted_segments.extend(kept_segments)
+            filtered_groups.append(working_group)
+    except Exception as exc:
+        retryable = is_model_output_error(exc) or isinstance(exc, ValueError)
+        if not retryable:
+            raise
+        message = format_attempt_error(
+            "Image relations VLM filter", this_attempt, exc
+        )
+        log.log_warning(message)
+        return {
+            "attempt_count": this_attempt,
+            "last_error": message,
+            "errors": errors + [message],
+        }
+
+    return {
+        "attempt_count": this_attempt,
+        "segment_groups": filtered_groups,
+        "last_error": None,
+    }
+
+
+def filter_overlapping_confirmed_segments(
+    *,
+    segments: list[dict[str, Any]],
+    confirmed_segments: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Drop candidates that almost exactly duplicate an already accepted mask.
+
+    The input list is returned unchanged when there is nothing to compare
+    against. Candidates whose own mask is missing or cannot be decoded are
+    always kept.
+    """
+    if not (segments and confirmed_segments):
+        return segments
+
+    accepted_masks = []
+    for confirmed in confirmed_segments:
+        if confirmed.get("mask_rle") is None:
+            continue
+        decoded = _segment_mask_array(confirmed)
+        if decoded is not None:
+            accepted_masks.append(decoded)
+    if not accepted_masks:
+        return segments
+
+    def _duplicates_accepted(mask: np.ndarray) -> bool:
+        return any(
+            _is_almost_confirmed_duplicate(
+                candidate_mask=mask,
+                confirmed_mask=accepted,
+            )
+            for accepted in accepted_masks
+        )
+
+    survivors: list[dict[str, Any]] = []
+    for candidate in segments:
+        mask = _segment_mask_array(candidate)
+        if mask is None or not _duplicates_accepted(mask):
+            survivors.append(candidate)
+
+    dropped = len(segments) - len(survivors)
+    if dropped:
+        log.log_info(
+            "removed overlapping confirmed segment candidates "
+            f"before VLM count={dropped}"
+        )
+    return survivors
+
+
+def _segment_mask_array(segment: dict[str, Any]) -> np.ndarray | None:
+    """Return the segment's mask as a boolean array, or ``None`` if unavailable.
+
+    ``None`` is returned both when the segment has no ``mask_rle`` and when
+    decoding or conversion fails for any reason (deliberately best effort).
+    """
+    encoded = segment.get("mask_rle")
+    if encoded is None:
+        return None
+    try:
+        grayscale = decode_rle_mask(encoded).convert("L")
+        pixels = np.asarray(grayscale, dtype=np.uint8)
+        return pixels > 0
+    except Exception:
+        return None
+
+
+def _is_almost_confirmed_duplicate(
+    *,
+    candidate_mask: np.ndarray,
+    confirmed_mask: np.ndarray,
+) -> bool:
+    """Tell whether *candidate_mask* nearly duplicates *confirmed_mask*.
+
+    Masks of different shape, or with no set pixels, never count as
+    duplicates. Otherwise the candidate is a duplicate when the share of its
+    pixels covered by the confirmed mask reaches
+    ``CONFIRMED_MASK_COVERAGE_THRESHOLD`` or the intersection-over-union
+    reaches ``CONFIRMED_MASK_IOU_THRESHOLD``.
+    """
+    if candidate_mask.shape != confirmed_mask.shape:
+        return False
+
+    candidate_area = int(np.count_nonzero(candidate_mask))
+    confirmed_area = int(np.count_nonzero(confirmed_mask))
+    if min(candidate_area, confirmed_area) <= 0:
+        return False
+
+    overlap = int(np.count_nonzero(candidate_mask & confirmed_mask))
+    combined = candidate_area + confirmed_area - overlap
+    coverage = overlap / float(candidate_area)
+    if coverage >= CONFIRMED_MASK_COVERAGE_THRESHOLD:
+        return True
+    iou = overlap / float(combined) if combined > 0 else 0.0
+    return iou >= CONFIRMED_MASK_IOU_THRESHOLD
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/layout_manifests.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/layout_manifests.py
new file mode 100644
index 000000000..6fb9e0c54
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/layout_manifests.py
@@ -0,0 +1,210 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.utils.io import (
+ relative_path,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+ GeometryManager,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+
+# ``from ... import *`` skips underscore-prefixed names unless ``__all__`` lists
+# them, so the manifest writer is exported explicitly here.
+__all__ = ["_write_multi_object_layout_manifests"]
+
+
+def _write_multi_object_layout_manifests(
+    *,
+    glb_gen_dir: Path,
+    output_root: Path,
+    table: dict[str, Any] | None,
+    objects: list[dict[str, Any]],
+    alignment: dict[str, Any] | None,
+) -> dict[str, str]:
+    """Write the simready-to-aligned manifest and report where it was written.
+
+    The manifest is written with ``write_json`` to
+    ``simready_to_aligned_manifest.json`` inside ``glb_gen_dir``.
+
+    Args:
+        glb_gen_dir: Directory that receives the manifest file.
+        output_root: Base passed to ``relative_path`` for the returned path
+            and forwarded to the manifest builder.
+        table: Table record, or None when there is no table.
+        objects: Object records, one manifest item each.
+        alignment: Alignment result; None is handled by the manifest builder.
+
+    Returns:
+        A dict with the single key ``simready_to_aligned_manifest_path``
+        holding ``relative_path(str(manifest_file), output_root)``.
+    """
+    manifest_file = glb_gen_dir / "simready_to_aligned_manifest.json"
+    manifest = _simready_to_aligned_manifest(
+        table=table,
+        items=objects,
+        alignment=alignment,
+        output_root=output_root,
+    )
+    write_json(manifest_file, manifest)
+    manifest_ref = relative_path(str(manifest_file), output_root)
+    return {"simready_to_aligned_manifest_path": manifest_ref}
+
+
+def _simready_to_aligned_manifest(
+    *,
+    table: dict[str, Any] | None,
+    items: list[dict[str, Any]],
+    alignment: dict[str, Any] | None,
+    output_root: Path,
+) -> dict[str, Any]:
+    """Assemble the simready-to-aligned manifest payload.
+
+    Args:
+        table: Table record, or None; None yields ``"table": None``.
+        items: Object records; each becomes one entry of ``"items"``.
+        alignment: Alignment result; None is treated as an empty dict, so
+            every alignment-derived field falls back to its default.
+        output_root: Forwarded to the table and item entry builders.
+
+    Returns:
+        The manifest dict: a fixed explanatory note, the alignment summary
+        fields copied from ``alignment``, the table entry and the item list.
+    """
+    info = alignment or {}
+    global_matrix = info.get("alignment_matrix", [])
+    axis_transform = info.get("glb_output_axis_transform", [])
+    per_object_matrices = info.get("object_alignment_matrices", {})
+    aligned_paths = _aligned_outputs_by_id(info)
+
+    table_entry = None
+    if table is not None:
+        table_entry = _simready_manifest_table_item(table, output_root=output_root)
+
+    item_entries = []
+    for record in items:
+        item_entries.append(
+            _simready_to_aligned_manifest_item(
+                record,
+                aligned_by_id=aligned_paths,
+                alignment_matrix=global_matrix,
+                object_alignment_matrices=per_object_matrices,
+                glb_output_axis_transform=axis_transform,
+                output_root=output_root,
+            )
+        )
+
+    return {
+        "note": (
+            "Aligned GLBs are generated from raw_downloads plus SAM3D layout "
+            "matrices in memory; simready paths are recorded here as the "
+            "simulation-ready counterpart for each raw GLB."
+        ),
+        "alignment_status": info.get("status", ""),
+        "alignment_reason": info.get("reason", ""),
+        "selected_up_down_variant": info.get("selected_up_down_variant", ""),
+        "applied_up_down_flip": info.get("applied_up_down_flip", False),
+        "alignment_matrix": global_matrix,
+        "global_metric_scale": info.get("global_metric_scale"),
+        "final_clutter_2d_aabb_cm": info.get("final_clutter_2d_aabb_cm"),
+        "glb_output_axis_transform": axis_transform,
+        "table": table_entry,
+        "items": item_entries,
+    }
+
+
+def _aligned_outputs_by_id(alignment: dict[str, Any]) -> dict[str, str]:
+    """Map each aligned object id to its aligned geometry path.
+
+    Args:
+        alignment: Alignment result; only its ``objects`` list is read. A
+            missing or None list is treated as empty.
+
+    Returns:
+        ``{str(id): path}`` for every dict entry with a truthy ``id``. The
+        path is "" when ``aligned_geometry_path`` is missing or None. When
+        an id occurs more than once, the last entry wins.
+    """
+    outputs: dict[str, str] = {}
+    for item in alignment.get("objects") or []:
+        if not isinstance(item, dict) or not item.get("id"):
+            continue
+        path = item.get("aligned_geometry_path")
+        # A present-but-None path must become "", not the literal "None".
+        outputs[str(item["id"])] = "" if path is None else str(path)
+    return outputs
+
+
+def _simready_manifest_table_item(
+    item: dict[str, Any],
+    *,
+    output_root: Path,
+) -> dict[str, Any]:
+    """Build the manifest entry that describes the table asset.
+
+    Args:
+        item: Table record; missing fields fall back to the defaults visible
+            in the returned dict.
+        output_root: Base used to resolve the two geometry paths and then to
+            express them through ``relative_path``.
+
+    Returns:
+        A dict of table metadata. ``simready_geometry_path`` and
+        ``support_reference_geometry_path`` are "" when the corresponding
+        field of ``item`` is missing or falsy.
+    """
+
+    def _relative_or_blank(field: str) -> Any:
+        raw = item.get(field)
+        if not raw:
+            return ""
+        resolved = _resolve_generated_path(raw, output_root)
+        return relative_path(str(resolved), output_root)
+
+    return {
+        "id": item.get("id", ""),
+        "name": item.get("name", ""),
+        "kind": item.get("kind", "table"),
+        "status": item.get("status", ""),
+        "simready_geometry_path": _relative_or_blank("simready_geometry_path"),
+        "support_reference_geometry_path": _relative_or_blank(
+            "support_reference_geometry_path"
+        ),
+        "table_asset_source": item.get("table_asset_source", ""),
+        "support_normal_source": item.get("support_normal_source", ""),
+        "is_complete_visible_table": item.get("is_complete_visible_table", False),
+        "complete_table_description": item.get("complete_table_description", ""),
+    }
+
+
+def _simready_to_aligned_manifest_item(
+    item: dict[str, Any],
+    *,
+    aligned_by_id: dict[str, str],
+    alignment_matrix: Any,
+    object_alignment_matrices: Any,
+    glb_output_axis_transform: Any,
+    output_root: Path,
+) -> dict[str, Any]:
+    """Build one manifest entry with the simready-to-aligned transform.
+
+    Args:
+        item: Object record. Reads ``id``, ``name``, ``kind``,
+            ``transform_matrix``, ``raw_to_simready_glb_matrix``,
+            ``simready_geometry_path`` and ``metric_scale``.
+        aligned_by_id: Aligned geometry path per object id.
+        alignment_matrix: Scene-wide alignment matrix, used unless a
+            per-object matrix exists for this id.
+        object_alignment_matrices: Per-object alignment matrices keyed by
+            id; ignored when it is not a dict.
+        glb_output_axis_transform: Axis transform passed first to
+            ``GeometryManager.compose_json_matrices``.
+        output_root: Accepted for signature parity; not used in the body.
+
+    Returns:
+        The entry dict, including the composed matrix and its decomposition
+        into ``translation``, ``rotation_matrix`` and ``scale``.
+    """
+    object_id = str(item.get("id", ""))
+    layout_transform = item.get("transform_matrix", [])
+    chosen_alignment = (
+        object_alignment_matrices.get(object_id, alignment_matrix)
+        if isinstance(object_alignment_matrices, dict)
+        else alignment_matrix
+    )
+    raw_to_aligned = GeometryManager.compose_json_matrices(
+        glb_output_axis_transform,
+        chosen_alignment,
+        layout_transform,
+    )
+    simready_to_aligned = GeometryManager.compose_simready_to_aligned_matrix(
+        raw_to_aligned_matrix=raw_to_aligned,
+        raw_to_simready_matrix=item.get("raw_to_simready_glb_matrix", []),
+    )
+    parts = GeometryManager.decompose_transform_matrix(simready_to_aligned)
+    entry: dict[str, Any] = {
+        "id": object_id,
+        "name": item.get("name", ""),
+        "kind": item.get("kind", ""),
+        "simready_geometry_path": item.get("simready_geometry_path", ""),
+        "aligned_geometry_path": aligned_by_id.get(object_id, ""),
+        "metric_scale": _trim_metric_scale(item.get("metric_scale")),
+        "simready_to_aligned_matrix": simready_to_aligned,
+    }
+    for part_name in ("translation", "rotation_matrix", "scale"):
+        entry[part_name] = parts[part_name]
+    return entry
+
+
+def _trim_metric_scale(value: Any) -> dict[str, Any] | None:
+    """Return a copy of a metric-scale dict without its file-path entries.
+
+    Args:
+        value: Candidate metric-scale record.
+
+    Returns:
+        None when ``value`` is not a dict; otherwise a new dict with the
+        ``result_path`` and ``raw_model_output_path`` keys left out. The
+        input dict is not modified.
+    """
+    if isinstance(value, dict):
+        dropped = ("result_path", "raw_model_output_path")
+        return {key: entry for key, entry in value.items() if key not in dropped}
+    return None
+
+
+def _resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Resolve a generated-asset path against ``output_root``.
+
+    Args:
+        value: Path-like value; a falsy value is treated as the empty path.
+        output_root: Base directory for relative values.
+
+    Returns:
+        The resolved path. Absolute values (after ``~`` expansion) are
+        resolved as they are; relative ones are joined onto ``output_root``
+        first, so a falsy ``value`` resolves to ``output_root`` itself.
+    """
+    candidate = Path(str(value or "")).expanduser()
+    if not candidate.is_absolute():
+        candidate = output_root / candidate
+    return candidate.resolve()
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/spatial_relations.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/spatial_relations.py
new file mode 100644
index 000000000..b5f938685
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/spatial_relations.py
@@ -0,0 +1,309 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+# Names exported by ``from ... import *``.
+__all__ = [
+    "GRID_VALUE_LIST",
+    "GRID_VALUES",
+    "RELATION_VALUE_LIST",
+    "RELATION_VALUES",
+    "assign_grids_from_anchor_and_orders",
+    "derive_relations_from_orders",
+    "invert_relation",
+    "normalize_relation",
+    "transitive_relation_closure",
+    "validate_exact_asset_id_coverage",
+]
+
+# The two canonical relations. ``normalize_relation`` rewrites "right_of" and
+# "behind" into these by swapping subject and object.
+RELATION_VALUE_LIST = ["left_of", "front_of"]
+RELATION_VALUES = frozenset(RELATION_VALUE_LIST)
+# Every supported relation mapped to its opposite. This includes the two
+# non-canonical names, so it is a superset of RELATION_VALUES.
+INVERSE_RELATIONS = {
+    "left_of": "right_of",
+    "right_of": "left_of",
+    "front_of": "behind",
+    "behind": "front_of",
+}
+
+# The nine grid labels. As encoded by ``_split_grid`` / ``_join_grid``: the
+# middle column uses the bare row name ("front", "center", "back"), the side
+# columns use "<left|right>_<front|center|back>".
+GRID_VALUE_LIST = [
+    "center",
+    "front",
+    "back",
+    "left_center",
+    "right_center",
+    "left_front",
+    "right_front",
+    "left_back",
+    "right_back",
+]
+GRID_VALUES = frozenset(GRID_VALUE_LIST)
+
+
+def validate_exact_asset_id_coverage(
+    *,
+    values: list[str],
+    expected_asset_ids: list[str],
+    context: str,
+) -> None:
+    """Validate that values contain every expected asset id exactly once.
+
+    Problems are reported in a fixed priority: duplicates first, then
+    missing ids, then unknown ids. Only the first failing check raises.
+
+    Args:
+        values: Asset ids to check.
+        expected_asset_ids: Ids that must each appear exactly once.
+        context: Prefix for the error message, naming what is being checked.
+
+    Raises:
+        ValueError: If ``values`` repeats an id, lacks an expected id, or
+            contains an id that is not expected. The message lists the
+            offending ids in sorted order.
+    """
+    expected = set(expected_asset_ids)
+    # One pass with a seen-set; ``values.count`` per element was quadratic.
+    seen: set[str] = set()
+    repeated: set[str] = set()
+    for asset_id in values:
+        if asset_id in seen:
+            repeated.add(asset_id)
+        seen.add(asset_id)
+    duplicates = sorted(repeated)
+    missing = sorted(expected - seen)
+    unknown = sorted(seen - expected)
+    if duplicates:
+        raise ValueError(f"{context} has duplicate asset ids: {duplicates}.")
+    if missing:
+        raise ValueError(f"{context} is missing asset ids: {missing}.")
+    if unknown:
+        raise ValueError(f"{context} has unknown asset ids: {unknown}.")
+
+
+def assign_grids_from_anchor_and_orders(
+    *,
+    anchor_asset_id: str,
+    anchor_grid: str,
+    x_order: list[list[str]],
+    y_order: list[list[str]],
+    asset_ids: list[str],
+) -> dict[str, str]:
+    """Assign 9-grid labels from one anchor grid and two object orderings.
+
+    Each asset is compared with the anchor by group index on both axes. A
+    smaller index gives "left" / "front", a larger one "right" / "back",
+    and an equal index reuses the anchor's own label for that axis.
+
+    Args:
+        anchor_asset_id: Asset whose grid label is known.
+        anchor_grid: Grid label of the anchor.
+        x_order: Groups of asset ids ordered along x.
+        y_order: Groups of asset ids ordered along y.
+        asset_ids: Assets to label.
+
+    Returns:
+        A grid label per id in ``asset_ids``.
+
+    Raises:
+        KeyError: If the anchor or any listed asset is absent from
+            ``x_order`` or ``y_order``.
+    """
+    anchor_column, anchor_row = _split_grid(anchor_grid)
+    column_of = _order_indices(x_order)
+    row_of = _order_indices(y_order)
+    anchor_column_index = column_of[anchor_asset_id]
+    anchor_row_index = row_of[anchor_asset_id]
+
+    def _grid_for(asset_id: str) -> str:
+        column = _axis_label_from_anchor(
+            index=column_of[asset_id],
+            anchor_index=anchor_column_index,
+            anchor_label=anchor_column,
+            before_label="left",
+            after_label="right",
+        )
+        row = _axis_label_from_anchor(
+            index=row_of[asset_id],
+            anchor_index=anchor_row_index,
+            anchor_label=anchor_row,
+            before_label="front",
+            after_label="back",
+        )
+        return _join_grid(x_label=column, y_label=row)
+
+    return {asset_id: _grid_for(asset_id) for asset_id in asset_ids}
+
+
+def invert_relation(relation: str) -> str:
+    """Return the inverse of a supported spatial relation.
+
+    Args:
+        relation: One of the keys of ``INVERSE_RELATIONS``.
+
+    Returns:
+        The opposite relation name.
+
+    Raises:
+        ValueError: If ``relation`` is not a key of ``INVERSE_RELATIONS``.
+    """
+    inverse = INVERSE_RELATIONS.get(relation)
+    if inverse is None:
+        raise ValueError(f"Unsupported spatial relation: {relation!r}.")
+    return inverse
+
+
+def normalize_relation(
+    *,
+    subject: str,
+    relation: str,
+    object_id: str,
+) -> tuple[str, str, str]:
+    """Normalize a relation into a canonical directional axis edge.
+
+    "right_of" and "behind" are rewritten as "left_of" and "front_of" with
+    subject and object swapped; the two canonical names pass through.
+
+    Args:
+        subject: Id on the left-hand side of the relation.
+        relation: "left_of", "right_of", "front_of" or "behind".
+        object_id: Id on the right-hand side of the relation.
+
+    Returns:
+        ``(subject, canonical_relation, object)`` for the canonical edge.
+
+    Raises:
+        ValueError: If ``relation`` is none of the four supported names.
+    """
+    if relation in ("left_of", "right_of"):
+        axis = "left_of"
+    elif relation in ("front_of", "behind"):
+        axis = "front_of"
+    else:
+        raise ValueError(f"Unsupported spatial relation: {relation!r}.")
+    if relation in ("right_of", "behind"):
+        return object_id, axis, subject
+    return subject, axis, object_id
+
+
+def transitive_relation_closure(
+    relations: list[dict[str, str]],
+) -> list[dict[str, str]]:
+    """Expand canonical left/front relations with transitive closure.
+
+    Each record is normalized with ``normalize_relation`` first, so
+    "right_of" / "behind" inputs contribute "left_of" / "front_of" edges.
+
+    Args:
+        relations: Records with ``subject``, ``relation`` and ``object``.
+
+    Returns:
+        Records with ``subject``, ``relation``, ``object`` and ``source``.
+        ``source`` is "input" when the canonical edge was given directly (in
+        either direction of naming) and "closure" when it was only derived.
+        All "left_of" records come first, then "front_of"; each group is
+        sorted by (subject, object).
+
+    Raises:
+        KeyError: If a record lacks one of the three required keys.
+        ValueError: If a relation name is unsupported, a record relates an
+            object to itself, two records directly contradict each other, or
+            the records of one axis form a cycle.
+    """
+    direct_edges: dict[str, set[tuple[str, str]]] = {
+        "left_of": set(),
+        "front_of": set(),
+    }
+    for relation_record in relations:
+        canonical_subject, canonical_relation, canonical_object = normalize_relation(
+            subject=relation_record["subject"],
+            relation=relation_record["relation"],
+            object_id=relation_record["object"],
+        )
+        if canonical_subject == canonical_object:
+            raise ValueError("Spatial relation cannot reference the same object.")
+        if (canonical_object, canonical_subject) in direct_edges[canonical_relation]:
+            raise ValueError(
+                "Conflicting spatial relations: "
+                f"{canonical_subject!r} {canonical_relation} {canonical_object!r}."
+            )
+        direct_edges[canonical_relation].add((canonical_subject, canonical_object))
+
+    output: list[dict[str, str]] = []
+    seen: set[tuple[str, str, str]] = set()
+    for canonical_relation, edges in direct_edges.items():
+        for subject, object_id in sorted(_transitive_edges(edges)):
+            if subject == object_id:
+                # Only a cycle (a < b < ... < a) can make an object reach
+                # itself; emitting "a left_of a" would be meaningless.
+                raise ValueError(
+                    "Conflicting spatial relations: "
+                    f"cycle through {subject!r} along {canonical_relation}."
+                )
+            _append_relation(
+                output=output,
+                seen=seen,
+                subject=subject,
+                relation=canonical_relation,
+                object_id=object_id,
+                # ``edges`` holds canonical pairs, so inputs phrased as
+                # "right_of" / "behind" are recognised as direct input too.
+                source="input" if (subject, object_id) in edges else "closure",
+            )
+    return output
+
+
+def derive_relations_from_orders(
+    *,
+    x_order: list[list[str]],
+    y_order: list[list[str]],
+) -> list[dict[str, str]]:
+    """Derive canonical relations from adjacent order groups.
+
+    Adjacent groups of ``x_order`` yield "left_of" relations and adjacent
+    groups of ``y_order`` yield "front_of" relations; the result is then
+    expanded with ``transitive_relation_closure``.
+
+    Args:
+        x_order: Groups of asset ids ordered along x.
+        y_order: Groups of asset ids ordered along y.
+
+    Returns:
+        The closed relation records, with every ``source`` of "input"
+        relabelled as "order" and other sources left unchanged.
+    """
+    ordered = [
+        *_relations_from_order_groups(x_order, relation="left_of"),
+        *_relations_from_order_groups(y_order, relation="front_of"),
+    ]
+    derived: list[dict[str, str]] = []
+    for record in transitive_relation_closure(ordered):
+        relabelled = dict(record)
+        if relabelled["source"] == "input":
+            relabelled["source"] = "order"
+        derived.append(relabelled)
+    return derived
+
+
+def _order_indices(order: list[list[str]]) -> dict[str, int]:
+    """Map every asset id to the index of the group that contains it.
+
+    An id that appears in several groups keeps the index of the last one.
+    """
+    positions: dict[str, int] = {}
+    for position, members in enumerate(order):
+        for asset_id in members:
+            positions[asset_id] = position
+    return positions
+
+
+def _split_grid(grid: str) -> tuple[str, str]:
+    """Split a grid label into its ``(x_label, y_label)`` parts.
+
+    "center" maps to ("center", "center"), a bare "front" / "back" belongs
+    to the centre column, and every other label is split at its first
+    underscore (so "left_center" gives ("left", "center")).
+
+    Raises:
+        ValueError: If a label other than "center", "front" or "back"
+            contains no underscore (tuple unpacking fails).
+    """
+    if grid == "center":
+        return "center", "center"
+    if grid in ("front", "back"):
+        return "center", grid
+    x_label, y_label = grid.split("_", maxsplit=1)
+    return x_label, y_label
+
+
+def _axis_label_from_anchor(
+    *,
+    index: int,
+    anchor_index: int,
+    anchor_label: str,
+    before_label: str,
+    after_label: str,
+) -> str:
+    """Pick the axis label for an index relative to the anchor's index.
+
+    Returns ``before_label`` for a smaller index, ``after_label`` for a
+    larger one and ``anchor_label`` when both indices are equal.
+    """
+    if index == anchor_index:
+        return anchor_label
+    return before_label if index < anchor_index else after_label
+
+
+def _join_grid(*, x_label: str, y_label: str) -> str:
+    """Join an x and a y label into a single grid label.
+
+    The centre column is named by its row alone ("front", "center",
+    "back"); every other column is named ``"<x_label>_<y_label>"``.
+    """
+    if x_label == "center":
+        # Covers the plain "center" cell too, where y_label is "center".
+        return y_label
+    return f"{x_label}_{y_label}"
+
+
+def _relations_from_order_groups(
+    order_groups: list[list[str]],
+    *,
+    relation: str,
+) -> list[dict[str, str]]:
+    """Relate every member of a group to every member of the next group.
+
+    Args:
+        order_groups: Groups of asset ids in axis order.
+        relation: Relation name stored in every produced record.
+
+    Returns:
+        One record per (earlier member, later member) pair of adjacent
+        groups, each with ``source`` set to "input". Fewer than two groups
+        produce an empty list.
+    """
+    return [
+        {
+            "subject": earlier_id,
+            "relation": relation,
+            "object": later_id,
+            "source": "input",
+        }
+        for earlier, later in zip(order_groups, order_groups[1:])
+        for earlier_id in earlier
+        for later_id in later
+    ]
+
+
+def _transitive_edges(
+    edges: set[tuple[str, str]],
+) -> set[tuple[str, str]]:
+    """Return the transitive closure of a set of directed edges.
+
+    Every node is used as a start and all nodes reachable from it are
+    collected with an explicit stack. If the edges contain a cycle, the
+    result also contains ``(node, node)`` pairs for the nodes on it.
+    """
+    successors: dict[str, set[str]] = {}
+    for source_node, target_node in edges:
+        successors.setdefault(source_node, set()).add(target_node)
+        successors.setdefault(target_node, set())
+
+    reachable_pairs: set[tuple[str, str]] = set(edges)
+    for origin, direct_targets in successors.items():
+        pending = list(direct_targets)
+        reached: set[str] = set()
+        while pending:
+            node = pending.pop()
+            if node not in reached:
+                reached.add(node)
+                reachable_pairs.add((origin, node))
+                pending.extend(successors.get(node, ()))
+    return reachable_pairs
+
+
+def _append_relation(
+    *,
+    output: list[dict[str, str]],
+    seen: set[tuple[str, str, str]],
+    subject: str,
+    relation: str,
+    object_id: str,
+    source: str,
+) -> None:
+    """Append a relation record unless the same triple was already added.
+
+    Both ``output`` and ``seen`` are modified in place. ``source`` is not
+    part of the identity, so the first record for a triple is the one kept.
+    """
+    identity = (subject, relation, object_id)
+    if identity not in seen:
+        seen.add(identity)
+        record = dict(
+            subject=subject,
+            relation=relation,
+            object=object_id,
+            source=source,
+        )
+        output.append(record)
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/table_clutter_fit.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/table_clutter_fit.py
new file mode 100644
index 000000000..91e59189f
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/table_clutter_fit.py
@@ -0,0 +1,360 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+
+from __future__ import annotations
+
+import tempfile
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_manager import (
+ GeometryManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager import (
+ SimulationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simulation_manager.schemas import (
+ GravityDropRequest,
+)
+
+# Names exported by ``from ... import *``.
+__all__ = ["fit_table_to_clutter"]
+
+
+def _resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Resolve a generated-asset path against ``output_root``.
+
+    Args:
+        value: Path-like value, possibly empty or None.
+        output_root: Base directory for relative values.
+
+    Returns:
+        An empty ``Path()`` when ``value`` is falsy. Otherwise the resolved
+        path: absolute values (after ``~`` expansion) are resolved as they
+        are, relative ones are joined onto the expanded, resolved
+        ``output_root`` first.
+    """
+    if not value:
+        return Path()
+    candidate = Path(str(value)).expanduser()
+    if not candidate.is_absolute():
+        candidate = output_root.expanduser().resolve() / candidate
+    return candidate.resolve()
+
+
+def _gravity_settle_table_fit_internal_z_scene(
+    scene: Any,
+    *,
+    z_to_y: np.ndarray,
+    sim_device: str,
+) -> tuple[Any, np.ndarray]:
+    """Settle a table scene under gravity and return the settled copy.
+
+    A copy of ``scene`` transformed by ``z_to_y`` is exported to a GLB in a
+    temporary directory and dropped in a headless ``SimulationManager`` run.
+    The reported final pose is then applied to a copy of ``scene``.
+
+    Args:
+        scene: Table scene; it is copied, not modified.
+        z_to_y: Transform applied to the copy that is exported for the
+            simulation.
+        sim_device: Device name forwarded to ``SimulationManager``.
+
+    Returns:
+        ``(settled_scene, gravity_transform)``, where the transform is the
+        simulation's ``final_pose`` as a float64 array.
+    """
+    # NOTE(review): the pose is applied to the un-transformed ``scene`` while
+    # the simulated GLB went through ``z_to_y`` -- confirm that ``final_pose``
+    # is expressed in the frame of ``scene``.
+    simulator = SimulationManager(headless=True, sim_device=sim_device)
+    with tempfile.TemporaryDirectory(prefix="p2s_table_fit_gravity_") as scratch:
+        glb_file = Path(scratch) / "table_pre_gravity.glb"
+        exportable = GeometryManager.copy_scene_with_transform(scene, z_to_y)
+        exportable.export(glb_file)
+        request = GravityDropRequest(
+            glb_path=glb_file,
+            max_convex_hull_num=8,
+            initial_height=0.05,
+        )
+        drop_result = simulator.run_gravity_simulation(request)
+        pose = np.asarray(drop_result.final_pose, dtype=np.float64)
+        settled_scene = scene.copy()
+        settled_scene.apply_transform(pose)
+        return settled_scene, pose
+
+
+def fit_table_to_clutter(
+    *,
+    table_result: dict[str, Any],
+    clutter_result: dict[str, Any],
+    output_root: Path,
+    output_dir: Path,
+    table_output_path: Path | None = None,
+    object_output_paths: dict[str, Path] | None = None,
+    margin_cm: float = 10.0,
+    support_occupancy_ratio: float = 0.80,
+    object_coverage_percent: int | None = None,
+    gravity_settle_table: bool = True,
+    sim_device: str = "cpu",
+) -> dict[str, Any]:
+    """Fit a table mesh to an already laid-out clutter result.
+
+    The table is scaled so that its detected support quad reaches the
+    required size, optionally settled under gravity, and shifted so that the
+    support centre lies on the XY origin and the lowest table point on
+    z = 0. Every object is shifted so that the clutter bounds are centred on
+    the XY origin and the object's lowest point sits 1 cm above the highest
+    table point. The table and the objects are exported as GLB files.
+
+    Scene coordinates are multiplied by 100 to obtain the ``*_cm`` values
+    (presumably the scenes are in metres -- confirm).
+
+    Args:
+        table_result: Table record. Reads ``simready_geometry_path``
+            (falling back to ``mesh_path``), ``is_complete_visible_table``
+            and ``complete_table_relative_scale_hint``.
+        clutter_result: Clutter record. Reads ``objects`` (only dict entries
+            whose ``status`` is "ok" are used; each needs ``id`` and either
+            ``laid_out_glb_path`` or ``settled_glb_path``) and
+            ``clutter_2d_aabb_cm``, whose ``size_xy`` only supplies the
+            aspect ratio for the initial support detection.
+        output_root: Base directory for relative input paths.
+        output_dir: Directory for outputs; created when missing.
+        table_output_path: Where to export the table; defaults to
+            ``output_dir / "table_fit_to_clutter.glb"``.
+        object_output_paths: Optional export path per object id; ids without
+            an entry go to ``output_dir / f"{id}_on_table.glb"``.
+        margin_cm: Margin added on both sides of each axis of the required
+            size. Not used when a complete-table scale hint applies.
+        support_occupancy_ratio: Fraction of the support the clutter may
+            occupy; clamped to [0.1, 1.0].
+        object_coverage_percent: If set, overrides
+            ``support_occupancy_ratio`` by converting the percentage to a
+            ratio (e.g. 30 → 0.30). The ratio is clamped to [0.1, 1.0], so
+            percentages below 10 behave like 10. The required table size is
+            computed as clutter_size / ratio (plus the margin). When None,
+            ``support_occupancy_ratio`` is used.
+        gravity_settle_table: Whether to settle the scaled table with a
+            gravity simulation before it is repositioned.
+        sim_device: Device name forwarded to the gravity simulation.
+
+    Returns:
+        A manifest dict with ``status`` "ok", the output paths, the placed
+        objects, the accumulated ``table_fit_transform``, the support quads
+        before and after fitting, the scale factors and a ``fit_check``.
+
+    Raises:
+        RuntimeError: If ``trimesh`` cannot be imported.
+        FileNotFoundError: If the table GLB is not an existing file.
+        ValueError: If no object has status "ok", or none of those objects
+            has an existing GLB file.
+    """
+    try:
+        import trimesh
+    except ImportError as exc:
+        raise RuntimeError("Table fitting requires trimesh.") from exc
+
+    output_root = output_root.expanduser().resolve()
+    output_dir = output_dir.expanduser().resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    if table_output_path is None:
+        table_output_path = output_dir / "table_fit_to_clutter.glb"
+    table_output_path = table_output_path.expanduser().resolve()
+    table_output_path.parent.mkdir(parents=True, exist_ok=True)
+    # Keys are stringified because object ids are compared as strings below.
+    object_output_paths = {
+        str(key): path.expanduser().resolve()
+        for key, path in (object_output_paths or {}).items()
+    }
+
+    # Resolve the table geometry.
+    table_simready_path = _resolve_generated_path(
+        table_result.get("simready_geometry_path") or table_result.get("mesh_path"),
+        output_root,
+    )
+    if not table_simready_path.is_file():
+        raise FileNotFoundError(f"Table simready GLB not found: {table_simready_path}")
+
+    # Resolve the clutter object geometries.
+    settled_objects = [
+        item
+        for item in clutter_result.get("objects", [])
+        if isinstance(item, dict) and item.get("status") == "ok"
+    ]
+    if not settled_objects:
+        raise ValueError("No successfully settled objects for table fitting.")
+
+    object_glb_paths: list[tuple[str, Path]] = []
+    for item in settled_objects:
+        glb_path = _resolve_generated_path(
+            item.get("laid_out_glb_path") or item.get("settled_glb_path"),
+            output_root,
+        )
+        # Objects whose GLB does not exist on disk are skipped without error.
+        if glb_path.is_file():
+            object_glb_paths.append((str(item["id"]), glb_path))
+
+    if not object_glb_paths:
+        raise ValueError("No valid settled object GLBs for table fitting.")
+
+    z_to_y = GeometryManager.z_up_to_glb_y_up_transform()
+    y_to_z = np.linalg.inv(z_to_y)
+
+    # Load the table and detect its support surface.
+    table_scene = GeometryManager.load_table_fit_scene_internal_z(
+        table_simready_path,
+        trimesh=trimesh,
+        y_to_z=y_to_z,
+    )
+    # Accumulates every transform applied to the table; each new transform is
+    # multiplied on from the left.
+    table_fit_transform = np.eye(4, dtype=np.float64)
+
+    table_mesh = GeometryManager.scene_to_mesh(table_scene, trimesh=trimesh)
+    # The caller-supplied AABB only provides an aspect ratio here; the sizes
+    # used for scaling are measured from the loaded object scenes below.
+    clutter_aabb = clutter_result.get("clutter_2d_aabb_cm") or {}
+    clutter_size = clutter_aabb.get("size_xy", [1.0, 1.0])
+    target_aspect = float(clutter_size[0]) / max(float(clutter_size[1]), 1.0e-6)
+    initial_support = GeometryManager.detect_table_fit_support_quad(
+        table_mesh,
+        target_aspect=target_aspect,
+    )
+
+    # Load the clutter scenes.
+    clutter_scenes = [
+        (
+            oid,
+            GeometryManager.load_table_fit_scene_internal_z(
+                path,
+                trimesh=trimesh,
+                y_to_z=y_to_z,
+            ),
+        )
+        for oid, path in object_glb_paths
+    ]
+    clutter_bounds = GeometryManager.table_fit_scene_union_bounds(
+        [scene for _, scene in clutter_scenes],
+        trimesh=trimesh,
+    )
+
+    # Compute the required table size and optional uniform scale.
+    clutter_size_cm = (clutter_bounds[1, :2] - clutter_bounds[0, :2]) * 100.0
+    if object_coverage_percent is not None:
+        support_occupancy_ratio = float(
+            np.clip(object_coverage_percent / 100.0, 0.1, 1.0)
+        )
+    occupancy = float(np.clip(support_occupancy_ratio, 0.1, 1.0))
+    required_size_cm = clutter_size_cm / occupancy + 2.0 * float(margin_cm)
+    scale_method = "fit_to_clutter_occupancy_margin"
+    relative_scale_hint = None
+    # A complete visible table with a usable hint replaces the occupancy and
+    # margin rule: the required size becomes clutter size times the hinted
+    # per-axis ratio, with each ratio raised to at least 1.0.
+    if table_result.get("is_complete_visible_table"):
+        hint = table_result.get("complete_table_relative_scale_hint")
+        if isinstance(hint, dict) and hint.get("status") == "ok":
+            ratio_xy = np.asarray(
+                hint.get("support_to_clutter_size_ratio_xy", []),
+                dtype=np.float64,
+            )
+            if ratio_xy.shape == (2,) and np.all(np.isfinite(ratio_xy)):
+                ratio_xy = np.maximum(ratio_xy, 1.0)
+                required_size_cm = clutter_size_cm * ratio_xy
+                scale_method = "complete_table_sam3d_raw_relative_uniform_xyz"
+                relative_scale_hint = hint
+    support_size_cm = np.asarray(initial_support["size_xy"], dtype=np.float64) * 100.0
+    scale_x = GeometryManager.table_fit_safe_positive_ratio(
+        required_size_cm[0],
+        support_size_cm[0],
+    )
+    scale_y = GeometryManager.table_fit_safe_positive_ratio(
+        required_size_cm[1],
+        support_size_cm[1],
+    )
+    # A single factor is used for both axes: the larger of the two ratios.
+    uniform_scale = max(scale_x, scale_y)
+    # Going by the helper names, the hinted case scales all three axes and the
+    # default case only X and Y -- confirm in GeometryManager.
+    if scale_method == "complete_table_sam3d_raw_relative_uniform_xyz":
+        table_scale_transform = GeometryManager.table_fit_uniform_scale_transform(
+            center_xy=np.asarray(initial_support["center_xy"], dtype=np.float64),
+            scale=uniform_scale,
+        )
+    else:
+        table_scale_transform = GeometryManager.table_fit_uniform_xy_scale_transform(
+            center_xy=np.asarray(initial_support["center_xy"], dtype=np.float64),
+            scale=uniform_scale,
+        )
+    table_scene.apply_transform(table_scale_transform)
+    table_fit_transform = table_scale_transform @ table_fit_transform
+
+    # Settle the table under gravity.
+    if gravity_settle_table:
+        table_scene, gravity_transform = _gravity_settle_table_fit_internal_z_scene(
+            table_scene,
+            z_to_y=z_to_y,
+            sim_device=sim_device,
+        )
+        table_fit_transform = gravity_transform @ table_fit_transform
+
+    # Reposition the table: support centre onto the XY origin, lowest table
+    # point onto z = 0.
+    final_table_mesh = GeometryManager.scene_to_mesh(table_scene, trimesh=trimesh)
+    final_support = GeometryManager.detect_table_fit_support_quad(
+        final_table_mesh,
+        target_aspect=float(required_size_cm[0] / max(required_size_cm[1], 1.0e-6)),
+    )
+    support_center = np.asarray(final_support["center"], dtype=np.float64)
+    table_bounds = np.asarray(final_table_mesh.bounds, dtype=np.float64)
+    table_bottom_z = float(table_bounds[0, 2])
+
+    table_shift = np.eye(4, dtype=np.float64)
+    table_shift[:3, 3] = [-support_center[0], -support_center[1], -table_bottom_z]
+    table_scene.apply_transform(table_shift)
+    table_fit_transform = table_shift @ table_fit_transform
+    # Height of the support centre once the table rests on z = 0.
+    support_z_after = float((support_center + table_shift[:3, 3])[2])
+
+    # Measure the table surface height.
+    # Use the highest point of the table mesh (after scaling + gravity + shift)
+    # rather than the support-plane mean Z, so that thin objects sit above the
+    # actual geometry even when the tabletop has slight unevenness.
+    _table_mesh_after_shift = GeometryManager.scene_to_mesh(
+        table_scene,
+        trimesh=trimesh,
+    )
+    _table_max_z = float(
+        np.asarray(_table_mesh_after_shift.bounds, dtype=np.float64)[1, 2]
+    )
+    _surface_z_margin = 0.01  # 1 cm above the highest table point
+
+    # Place the objects on the table.
+    placed_objects: list[dict[str, Any]] = []
+    shifted_clutter: list[tuple[str, Any]] = []
+    # NOTE(review): no statement between here and the earlier
+    # ``clutter_bounds`` computation transforms ``clutter_scenes``, so this
+    # looks like a recomputation of the same bounds -- confirm before reusing.
+    clutter_after = GeometryManager.table_fit_scene_union_bounds(
+        [scene for _, scene in clutter_scenes],
+        trimesh=trimesh,
+    )
+    clutter_center_xy = 0.5 * (clutter_after[0, :2] + clutter_after[1, :2])
+    for oid, scene in clutter_scenes:
+        obj_mesh = GeometryManager.scene_to_mesh(scene, trimesh=trimesh)
+        obj_bounds = np.asarray(obj_mesh.bounds, dtype=np.float64)
+        obj_bottom_z = float(obj_bounds[0, 2])
+        # Every object gets the same XY shift, so the layout is preserved;
+        # the Z shift is per object so each one starts just above the table.
+        obj_shift = np.eye(4, dtype=np.float64)
+        obj_shift[:3, 3] = [
+            -float(clutter_center_xy[0]),
+            -float(clutter_center_xy[1]),
+            _table_max_z - obj_bottom_z + _surface_z_margin,
+        ]
+        scene.apply_transform(obj_shift)
+        shifted_clutter.append((oid, scene))
+
+    # Export the fitted table and placed objects.
+    GeometryManager.copy_scene_with_transform(table_scene, z_to_y).export(
+        table_output_path
+    )
+
+    for oid, scene in shifted_clutter:
+        object_path = object_output_paths.get(oid, output_dir / f"{oid}_on_table.glb")
+        object_path.parent.mkdir(parents=True, exist_ok=True)
+        GeometryManager.copy_scene_with_transform(scene, z_to_y).export(object_path)
+        # Compute the AABB bottom-centre from ``scene`` itself, i.e. in the
+        # internal Z-up frame: the export above writes the result of
+        # ``copy_scene_with_transform`` (presumably a copy, leaving ``scene``
+        # untouched -- confirm). This is the reference position that
+        # gym_export uses to derive ``init_pos``.
+        _placed_mesh = GeometryManager.scene_to_mesh(scene, trimesh=trimesh)
+        _placed_b = np.asarray(_placed_mesh.bounds, dtype=np.float64)
+        world_aabb_bottom_center = [
+            float(0.5 * (_placed_b[0, 0] + _placed_b[1, 0])),
+            float(0.5 * (_placed_b[0, 1] + _placed_b[1, 1])),
+            float(_placed_b[0, 2]),
+        ]
+        placed_objects.append(
+            {
+                "id": oid,
+                "path": str(object_path),
+                "world_aabb_bottom_center": world_aabb_bottom_center,
+            }
+        )
+
+    final_clutter_bounds = GeometryManager.table_fit_scene_union_bounds(
+        [scene for _, scene in shifted_clutter],
+        trimesh=trimesh,
+    )
+    final_clutter_aabb_cm = GeometryManager.table_fit_bounds_xy_manifest(
+        final_clutter_bounds,
+        unit_scale=100.0,
+    )
+    # ``final_support`` was detected before ``table_shift``; re-express its
+    # positions in the shifted frame for the manifest.
+    final_support_centered = {
+        **final_support,
+        "center": (support_center + table_shift[:3, 3]).tolist(),
+        "center_xy": (
+            np.asarray(final_support["center_xy"], dtype=np.float64)
+            - support_center[:2]
+        ).tolist(),
+        "corners_xy": (
+            np.asarray(final_support["corners_xy"], dtype=np.float64)
+            - support_center[:2]
+        ).tolist(),
+    }
+    manifest = {
+        "status": "ok",
+        "output_dir": str(output_dir),
+        "table_simready_path": str(table_simready_path),
+        "table_output_path": str(table_output_path),
+        "objects": placed_objects,
+        "margin_cm": margin_cm,
+        "support_occupancy_ratio": occupancy,
+        "gravity_settle_table": gravity_settle_table,
+        # Constant by construction: table_shift put the lowest point on z = 0.
+        "table_bottom_z_after_shift": 0.0,
+        "support_z_after_shift": support_z_after,
+        "table_fit_transform": table_fit_transform.tolist(),
+        "initial_support_quad": initial_support,
+        "final_support_quad_centered": final_support_centered,
+        "clutter_2d_aabb_cm": final_clutter_aabb_cm,
+        "required_support_size_cm": required_size_cm.tolist(),
+        "table_xy_scale": {
+            "method": scale_method,
+            "uniform_scale": uniform_scale,
+            "scale_x_raw": scale_x,
+            "scale_y_raw": scale_y,
+            "support_size_before_scale_cm": support_size_cm.tolist(),
+            "complete_table_relative_scale_hint": relative_scale_hint,
+        },
+        # Compares the final clutter extent (cm) with the final support size
+        # converted to cm, per axis.
+        "fit_check": {
+            "fits_width": float(final_clutter_aabb_cm["size_xy"][0])
+            <= float(np.asarray(final_support_centered["size_xy"])[0] * 100.0),
+            "fits_depth": float(final_clutter_aabb_cm["size_xy"][1])
+            <= float(np.asarray(final_support_centered["size_xy"])[1] * 100.0),
+        },
+    }
+    return manifest
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/table_fit_scene.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/table_fit_scene.py
new file mode 100644
index 000000000..d36cb718f
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/table_fit_scene.py
@@ -0,0 +1,158 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import traceback
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.table_clutter_fit import (
+ fit_table_to_clutter,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import relative_path, write_json
+from embodichain.gen_sim.prompt2scene.utils.log import log_info, log_warning
+
+# Names exported by ``from ... import *``.
+__all__ = ["fit_image_scene_table", "fit_text_scene_table"]
+
+
+def _table_fit_output_paths(
+    *,
+    output_dir: Path,
+    object_ids: list[str],
+) -> tuple[Path, dict[str, Path]]:
+    """Return the output GLB paths for the fitted table and its objects.
+
+    Args:
+        output_dir: Directory that will hold every output file.
+        object_ids: Ids of the objects to be placed on the table.
+
+    Returns:
+        ``(table_path, object_paths)``: the table goes to
+        ``table_fit_to_clutter.glb`` and each object to
+        ``<object_id>_on_table.glb``, all inside ``output_dir``.
+    """
+    table_path = output_dir / "table_fit_to_clutter.glb"
+    object_paths: dict[str, Path] = {}
+    for object_id in object_ids:
+        object_paths[object_id] = output_dir / f"{object_id}_on_table.glb"
+    return table_path, object_paths
+
+
+def fit_text_scene_table(
+    *,
+    table_result: dict[str, Any],
+    clutter_layout_result: dict[str, Any],
+    output_root: Path,
+    output_dir: Path,
+) -> dict[str, Any]:
+    """Fit the text-scene table and convert failures to result data.
+
+    Args:
+        table_result: Table record passed to ``fit_table_to_clutter``; its
+            ``object_coverage_percent`` entry is forwarded as well.
+        clutter_layout_result: Clutter layout passed on unchanged. Output
+            paths are prepared for its dict ``objects`` entries that have a
+            truthy ``id`` and ``status`` "ok".
+        output_root: Base for relative paths.
+        output_dir: Directory for the GLBs and the manifest JSON.
+
+    Returns:
+        ``{"status": "ok", "manifest_path": ...}`` on success. Any
+        ``Exception`` is logged as a warning and turned into
+        ``{"status": "failed", "reason": <formatted traceback>}``.
+    """
+    try:
+        ok_ids: list[str] = []
+        for entry in clutter_layout_result.get("objects", []):
+            if not isinstance(entry, dict):
+                continue
+            if entry.get("id") and entry.get("status") == "ok":
+                ok_ids.append(str(entry["id"]))
+        table_glb, object_glbs = _table_fit_output_paths(
+            output_dir=output_dir,
+            object_ids=ok_ids,
+        )
+        fit_manifest = fit_table_to_clutter(
+            table_result=table_result,
+            clutter_result=clutter_layout_result,
+            output_root=output_root,
+            output_dir=output_dir,
+            table_output_path=table_glb,
+            object_output_paths=object_glbs,
+            object_coverage_percent=table_result.get("object_coverage_percent"),
+        )
+        manifest_file = output_dir / "table_fit_to_clutter_manifest.json"
+        write_json(manifest_file, fit_manifest)
+        result = {
+            "status": "ok",
+            "manifest_path": relative_path(str(manifest_file), output_root),
+        }
+        log_info(f"text table fit completed status={result.get('status')}")
+        return result
+    except Exception as exc:
+        # Failures are reported as data so the caller's pipeline can go on.
+        log_warning(f"text table fit failed error={exc}")
+        return {"status": "failed", "reason": traceback.format_exc()}
+
+
+def fit_image_scene_table(
+    *,
+    layout_result: dict[str, Any],
+    fallback_table_result: dict[str, Any] | None,
+    output_root: Path,
+    output_dir: Path,
+) -> dict[str, Any]:
+    """Fit the image-scene table or return a structured skipped result.
+
+    Args:
+        layout_result: Layout record. Reads ``table``, ``objects`` and
+            ``alignment``.
+        fallback_table_result: Table record used when ``layout_result`` has
+            no truthy ``table``.
+        output_root: Base for relative paths.
+        output_dir: Directory for the GLBs and the manifest JSON.
+
+    Returns:
+        ``{"status": "skipped", ...}`` when there is no table, no objects,
+        or ``alignment`` is not a dict. Otherwise
+        ``{"status": "ok", "manifest_path": ...}`` on success, or
+        ``{"status": "failed", "reason": <formatted traceback>}`` when any
+        ``Exception`` is raised while fitting.
+    """
+    generated_table = layout_result.get("table") or fallback_table_result
+    generated_objects = layout_result.get("objects") or []
+    alignment_result = layout_result.get("alignment")
+    if (
+        generated_table is None
+        or not generated_objects
+        or not isinstance(alignment_result, dict)
+    ):
+        return {
+            "status": "skipped",
+            "reason": "missing_table_objects_or_alignment",
+        }
+
+    try:
+        # One filter feeds both the output paths and the clutter objects, so
+        # a non-dict entry is skipped in both places instead of raising
+        # while the clutter list is built.
+        usable_objects = [
+            item
+            for item in generated_objects
+            if isinstance(item, dict)
+            and item.get("id")
+            and item.get("aligned_geometry_path")
+        ]
+        object_ids = [str(item["id"]) for item in usable_objects]
+        table_output_path, object_output_paths = _table_fit_output_paths(
+            output_dir=output_dir,
+            object_ids=object_ids,
+        )
+        clutter_result = {
+            "clutter_2d_aabb_cm": alignment_result.get(
+                "final_clutter_2d_aabb_cm"
+            ),
+            "objects": [
+                {
+                    "id": item["id"],
+                    "status": "ok",
+                    "laid_out_glb_path": item["aligned_geometry_path"],
+                }
+                for item in usable_objects
+            ],
+        }
+        manifest = fit_table_to_clutter(
+            table_result=generated_table,
+            clutter_result=clutter_result,
+            output_root=output_root,
+            output_dir=output_dir,
+            table_output_path=table_output_path,
+            object_output_paths=object_output_paths,
+            object_coverage_percent=generated_table.get("object_coverage_percent"),
+        )
+        manifest_path = output_dir / "table_fit_to_clutter_manifest.json"
+        write_json(manifest_path, manifest)
+        result = {
+            "status": "ok",
+            "manifest_path": relative_path(str(manifest_path), output_root),
+        }
+        log_info(f"image table fit completed status={result.get('status')}")
+        return result
+    except Exception as exc:
+        # Failures are reported as data so the caller's pipeline can go on.
+        log_warning(f"image table fit failed error={exc}")
+        return {
+            "status": "failed",
+            "reason": traceback.format_exc(),
+        }
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_asset_generation.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_asset_generation.py
new file mode 100644
index 000000000..ada7ad789
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_asset_generation.py
@@ -0,0 +1,296 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import shutil
+import traceback
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.geometry_generation_manager import (
+ GeometryGenerationManager,
+ RgbaImageToGeometryRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_generation_manager import (
+ ImageGenerationManager,
+ TextToAssetImageRequest,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.image_segmentation_manager import (
+ AssetImageToRgbaRequest,
+ ImageSegmentationManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager import (
+ MakeAssetSimreadyRequest,
+ MakeTableSimreadyRequest,
+ SimreadyManager,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import log_info, log_warning
+
+__all__ = [
+ "generate_text_object_asset",
+ "generate_text_object_assets",
+ "generate_text_table_asset",
+]
+
+
def _segmentation_prompts(
    object_name: str,
    description: str,
    class_candidates: list[str],
) -> list[str]:
    """Return the ordered, de-duplicated prompts to try for RGBA segmentation.

    The description is tried first, then one prompt per class candidate. A
    generic "single isolated object" prompt is used only when neither exists.
    """
    prompts: list[str] = []
    stripped_description = description.strip()
    if stripped_description:
        prompts.append(stripped_description)
    for candidate in class_candidates:
        candidate_prompt = f"The entire {candidate} on the center of the image"
        if candidate_prompt not in prompts:
            prompts.append(candidate_prompt)
    if not prompts:
        prompts.append(
            f"the entire single isolated object {object_name}"
            if object_name
            else "the entire single isolated object"
        )
    return prompts


def generate_text_object_asset(
    *,
    object_spec: dict[str, Any],
    image_gen_dir: Path,
    glb_gen_dir: Path,
    debug_dir: Path,
) -> dict[str, Any]:
    """Generate one object asset from a text-origin object spec.

    The steps are: text -> asset image -> RGBA cut-out -> raw GLB -> simready
    GLB. Any exception raised by those steps is caught, logged, and reported
    through the ``status`` field of the returned dict.

    Args:
        object_spec: Object description. Reads ``id``, ``name``,
            ``description`` and ``class_candidate``. Missing or ``None``
            values fall back to defaults instead of the literal "None".
        image_gen_dir: Directory that receives the RGBA image.
        glb_gen_dir: Directory that receives ``raw_downloads/`` and
            ``text_objects_simready/`` outputs.
        debug_dir: Directory under which a per-object debug folder is created.

    Returns:
        A result dict with ``status`` equal to ``"ok"`` or
        ``"failed: <traceback>"``. Path fields stay empty strings for steps
        that did not complete.
    """
    raw_id = object_spec.get("id")
    object_id = "object" if raw_id is None else str(raw_id)
    object_name = str(object_spec.get("name") or "")
    description = str(object_spec.get("description") or "")

    # Specs may carry ``null`` or a single string here. Normalising up front
    # keeps a malformed spec from raising before the try block below, which
    # would abort the whole batch instead of failing this one object.
    raw_candidates = object_spec.get("class_candidate")
    if raw_candidates is None:
        raw_candidates = []
    elif isinstance(raw_candidates, str):
        raw_candidates = [raw_candidates]
    class_candidates = [
        candidate.replace("_", " ")
        for candidate in raw_candidates
        if isinstance(candidate, str) and candidate.strip()
    ]

    status = "ok"
    image_path = ""
    raw_geometry_path = ""
    mesh_path = ""
    raw_to_simready_matrix: list[list[float]] = []

    debug_subdir = debug_dir / object_id
    debug_subdir.mkdir(parents=True, exist_ok=True)
    log_info(f"text object generation started id={object_id} name={object_name}")

    image_manager = ImageGenerationManager()
    segmentation_manager = ImageSegmentationManager()
    geometry_manager = GeometryGenerationManager()
    simready_manager = SimreadyManager()

    try:
        image_prompt = f"{object_name}, {description}".strip(", ")
        raw_image_path = str(
            image_manager.generate_asset_image_from_text(
                TextToAssetImageRequest(
                    prompt=image_prompt,
                    output_path=debug_subdir / f"{object_id}.png",
                )
            )
        )

        # Try each segmentation prompt in turn and keep the first success.
        rgba_path = ""
        last_rgba_error: Exception | None = None
        for prompt in _segmentation_prompts(
            object_name, description, class_candidates
        ):
            try:
                rgba_path = str(
                    segmentation_manager.convert_asset_image_to_rgba(
                        AssetImageToRgbaRequest(
                            image_path=Path(raw_image_path),
                            prompt=prompt,
                            output_path=image_gen_dir / f"{object_id}.png",
                        )
                    )
                )
                break
            except Exception as exc:
                last_rgba_error = exc
                log_warning(
                    "text object segmentation prompt failed "
                    f"id={object_id} prompt={prompt!r} error={exc}"
                )
        if not rgba_path:
            raise last_rgba_error or RuntimeError(
                f"No RGBA prompt succeeded for {object_id}"
            )

        raw_glb_path = str(
            geometry_manager.convert_rgba_image_to_geometry(
                RgbaImageToGeometryRequest(
                    image_path=Path(rgba_path),
                    output_path=debug_subdir / f"{object_id}_raw.glb",
                )
            )
        )
        # Keep a copy of the raw geometry next to the other generated GLBs.
        raw_geometry_dir = glb_gen_dir / "raw_downloads"
        raw_geometry_dir.mkdir(parents=True, exist_ok=True)
        object_raw_path = raw_geometry_dir / f"{object_id}_raw.glb"
        shutil.copy2(raw_glb_path, object_raw_path)
        raw_geometry_path = str(object_raw_path)

        simready_result = simready_manager.make_asset_simready(
            MakeAssetSimreadyRequest(
                input_path=Path(raw_glb_path),
                output_path=glb_gen_dir
                / "text_objects_simready"
                / f"{object_id}_simready.glb",
            )
        )
        mesh_path = str(simready_result.output_path)
        raw_to_simready_matrix = simready_result.transform_matrix

        # image_path is published only once every step has succeeded.
        image_path = rgba_path
        log_info(f"text object generation completed id={object_id} mesh={mesh_path}")
    except Exception as exc:
        status = f"failed: {traceback.format_exc()}"
        log_warning(f"text object generation failed id={object_id} error={exc}")

    return {
        "id": object_id,
        "name": object_name,
        "description": description,
        "status": status,
        "image_path": image_path,
        "raw_geometry_path": raw_geometry_path,
        "mesh_path": mesh_path,
        "simready_geometry_path": mesh_path,
        "raw_to_simready_glb_matrix": raw_to_simready_matrix,
        "metric_scale": None,
    }
+
+
def generate_text_object_assets(
    *,
    object_specs: list[dict[str, Any]],
    image_gen_dir: Path,
    glb_gen_dir: Path,
    debug_dir: Path,
) -> list[dict[str, Any]]:
    """Generate assets for every object spec of a text-origin scene.

    Args:
        object_specs: Object specs, processed sequentially in the given order.
        image_gen_dir: Forwarded to ``generate_text_object_asset``.
        glb_gen_dir: Forwarded to ``generate_text_object_asset``.
        debug_dir: Forwarded to ``generate_text_object_asset``.

    Returns:
        One result dict per spec, in input order.
    """
    log_info(f"text object batch generation started count={len(object_specs)}")

    results: list[dict[str, Any]] = []
    ok_count = 0
    for spec in object_specs:
        outcome = generate_text_object_asset(
            object_spec=spec,
            image_gen_dir=image_gen_dir,
            glb_gen_dir=glb_gen_dir,
            debug_dir=debug_dir,
        )
        results.append(outcome)
        if outcome.get("status") == "ok":
            ok_count += 1

    log_info(
        f"text object batch generation completed "
        f"succeeded={ok_count} failed={len(results) - ok_count}"
    )
    return results
+
+
def generate_text_table_asset(
    *,
    table_spec: dict[str, Any],
    image_gen_dir: Path,
    glb_gen_dir: Path,
    debug_dir: Path,
) -> dict[str, Any]:
    """Generate the table asset for a text-origin unified scene.

    The steps are: text -> asset image -> RGBA cut-out -> raw GLB -> simready
    GLB. Any exception raised by those steps is caught, logged, and reported
    through the ``status`` field of the returned dict.

    Args:
        table_spec: Table description. ``complete_table_description`` is
            preferred over ``description`` as the generation prompt. ``None``
            values are treated as missing rather than as the text "None".
        image_gen_dir: Directory that receives the RGBA image.
        glb_gen_dir: Directory that receives ``raw_downloads/table_raw.glb``
            and the simready table GLB.
        debug_dir: Directory under which a per-table debug folder is created.

    Returns:
        A result dict with ``status`` equal to ``"ok"`` or
        ``"failed: <traceback>"``. Path fields stay empty strings for steps
        that did not complete.
    """
    raw_id = table_spec.get("id")
    table_id = "table" if raw_id is None else str(raw_id)
    # ``or ""`` keeps a null description from turning into the prompt "None".
    description = str(
        table_spec.get("complete_table_description")
        or table_spec.get("description")
        or ""
    ).strip()
    status = "ok"
    image_path = ""
    raw_geometry_path = ""
    generated_table_raw_geometry_path = ""
    mesh_path = ""

    debug_subdir = debug_dir / table_id
    debug_subdir.mkdir(parents=True, exist_ok=True)
    log_info(f"text table generation started id={table_id}")

    image_manager = ImageGenerationManager()
    segmentation_manager = ImageSegmentationManager()
    geometry_manager = GeometryGenerationManager()
    simready_manager = SimreadyManager()

    try:
        # NOTE(review): an empty description is sent to the image model as-is;
        # only segmentation falls back to "whole table". Confirm this is
        # intended.
        raw_image_path = str(
            image_manager.generate_asset_image_from_text(
                TextToAssetImageRequest(
                    prompt=description,
                    output_path=debug_subdir / f"{table_id}.png",
                )
            )
        )
        rgba_path = str(
            segmentation_manager.convert_asset_image_to_rgba(
                AssetImageToRgbaRequest(
                    image_path=Path(raw_image_path),
                    # description is already stripped above.
                    prompt=description or "whole table",
                    output_path=image_gen_dir / f"{table_id}.png",
                )
            )
        )
        raw_glb_path = str(
            geometry_manager.convert_rgba_image_to_geometry(
                RgbaImageToGeometryRequest(
                    image_path=Path(rgba_path),
                    output_path=debug_subdir / f"{table_id}_raw.glb",
                )
            )
        )
        generated_table_raw_geometry_path = raw_glb_path
        # Keep a copy of the raw geometry next to the other generated GLBs.
        raw_geometry_dir = glb_gen_dir / "raw_downloads"
        raw_geometry_dir.mkdir(parents=True, exist_ok=True)
        table_raw_path = raw_geometry_dir / "table_raw.glb"
        shutil.copy2(raw_glb_path, table_raw_path)
        raw_geometry_path = str(table_raw_path)
        mesh_path = str(
            simready_manager.make_table_simready(
                MakeTableSimreadyRequest(
                    input_path=Path(raw_geometry_path),
                    output_path=glb_gen_dir
                    / "text_objects_simready"
                    / f"{table_id}_simready.glb",
                )
            ).output_path
        )
        # image_path is published only once every step has succeeded.
        image_path = rgba_path
        log_info(f"text table generation completed id={table_id} mesh={mesh_path}")
    except Exception as exc:
        status = f"failed: {traceback.format_exc()}"
        log_warning(f"text table generation failed id={table_id} error={exc}")

    return {
        "id": table_id,
        "name": str(table_spec.get("name") or "table"),
        "description": str(table_spec.get("description") or ""),
        "complete_table_description": description,
        "is_complete_visible_table": bool(
            table_spec.get("is_complete_visible_table", False)
        ),
        "object_coverage_percent": table_spec.get("object_coverage_percent"),
        "status": status,
        "image_path": image_path,
        "raw_geometry_path": raw_geometry_path,
        "generated_table_raw_geometry_path": generated_table_raw_geometry_path,
        "support_reference_geometry_path": "",
        "table_asset_source": "description_generated",
        "support_normal_source": "",
        "mesh_path": mesh_path,
        "simready_geometry_path": mesh_path,
    }
diff --git a/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_scene_metric_scale.py b/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_scene_metric_scale.py
new file mode 100644
index 000000000..b0ceb0cae
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/agent_tools/tools/text_scene_metric_scale.py
@@ -0,0 +1,161 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import traceback
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.simready_manager import (
+ METRIC_SCALE_ENABLED,
+ EstimateMetricScalesRequest,
+ MetricScaleManager,
+ MetricScaleObjectInput,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+from embodichain.gen_sim.prompt2scene.utils.log import log_info, log_warning
+
+__all__ = ["build_metric_scale_inputs", "estimate_text_scene_metric_scale"]
+
+
def estimate_text_scene_metric_scale(
    *,
    object_results: list[dict[str, Any]],
    user_text: str,
    messages: list[dict[str, Any]],
    schema: dict[str, Any],
    output_dir: Path,
    output_root: Path,
    llm: Any | None,
    step_name: str,
) -> dict[str, Any]:
    """Estimate real-world scales for the generated objects of a text scene.

    Args:
        object_results: Generated object result dicts; passed to
            ``MetricScaleManager`` so it can record the outcome on them.
        user_text: Original user text, echoed in the returned result.
        messages: Chat messages forwarded to the metric-scale estimator.
        schema: JSON schema forwarded to the metric-scale estimator.
        output_dir: Directory that receives ``raw_model_output.json``.
        output_root: Root used to resolve relative mesh paths.
        llm: Model handle; the step is skipped when it is ``None``.
        step_name: Step label forwarded to the estimator.

    Returns:
        A result dict whose ``status`` is ``"skipped"``, ``"ok"`` or
        ``"failed"``. Skipped and failed results carry a ``reason``.
    """
    method = "text_scene_vlm_candidate_shape_ratio_median_scale"
    result: dict[str, Any] = {
        "status": "skipped",
        "method": method,
        "user_text": user_text,
        "objects": [],
    }

    def _stamp_all(status: str, reason: str) -> None:
        # Record one shared outcome on every object result.
        MetricScaleManager.set_for_all_objects(
            objects=object_results,
            status=status,
            reason=reason,
            method=method,
        )

    try:
        if not object_results:
            result["reason"] = "missing_objects"
            log_warning("text scene metric scale skipped reason=missing_objects")
            return result

        if not METRIC_SCALE_ENABLED:
            result["reason"] = "metric_scale_disabled"
            _stamp_all("skipped", "metric_scale_disabled")
            log_info("text scene metric scale skipped reason=metric_scale_disabled")
            return result

        if llm is None:
            result["reason"] = "missing_llm"
            _stamp_all("skipped", "missing_llm")
            log_warning("text scene metric scale skipped reason=missing_llm")
            return result

        log_info(f"text scene metric scale started count={len(object_results)}")
        metric_objects = build_metric_scale_inputs(
            objects=object_results,
            output_root=output_root,
        )
        result["objects"] = MetricScaleManager.object_prompt_payload(metric_objects)

        raw_output_path = output_dir / "raw_model_output.json"
        request = EstimateMetricScalesRequest(
            objects=metric_objects,
            messages=messages,
            schema=schema,
            llm=llm,
            context="Text scene metric scale estimate",
            method=method,
            step_name=step_name,
            raw_output_path=raw_output_path,
        )
        metric_result = MetricScaleManager.estimate_metric_scales(request)

        # Write the raw model output here only if the file is still missing
        # after the estimator call; a write failure is logged, not raised.
        raw_model_output = metric_result.raw_model_output or {}
        if not raw_output_path.is_file():
            try:
                write_json(raw_output_path, raw_model_output)
            except Exception as write_exc:
                log_warning(
                    f"metric scale raw output write failed error={write_exc}"
                )

        estimates = metric_result.object_scales
        MetricScaleManager.apply_to_objects(
            objects=object_results,
            object_scales=estimates,
        )
        result["status"] = "ok"
        result["object_scales"] = estimates
        result["unit_note"] = (
            "Per-object scale_factor is not baked into simready GLBs. "
            "For text input, simready_geometry_path multiplied by this "
            "scale_factor gives the estimated real-world size."
        )
        log_info(f"text scene metric scale completed count={len(estimates)}")
    except Exception as exc:
        result["status"] = "failed"
        result["reason"] = traceback.format_exc()
        _stamp_all("failed", "text_scene_metric_scale_failed")
        log_warning(f"text scene metric scale failed error={exc}")
    return result
+
+
def build_metric_scale_inputs(
    *,
    objects: list[dict[str, Any]],
    output_root: Path,
) -> list[MetricScaleObjectInput]:
    """Convert generated object results into metric-scale estimator inputs.

    Args:
        objects: Object result dicts. ``simready_geometry_path`` is preferred
            over ``mesh_path`` as the mesh location.
        output_root: Root used to resolve relative mesh paths.

    Returns:
        One ``MetricScaleObjectInput`` per object, in input order.

    Raises:
        FileNotFoundError: If an object's resolved mesh path is not a file.
    """
    metric_inputs: list[MetricScaleObjectInput] = []
    for entry in objects:
        glb_reference = entry.get("simready_geometry_path") or entry.get("mesh_path")
        glb_path = _resolve_generated_path(glb_reference, output_root)
        if not glb_path.is_file():
            raise FileNotFoundError(f"Simready object GLB not found: {glb_path}")
        metric_input = MetricScaleObjectInput(
            object_id=str(entry.get("id", "")),
            object_name=str(entry.get("name", "")),
            object_description=str(entry.get("description", "")),
            mesh_path=glb_path,
        )
        metric_inputs.append(metric_input)
    return metric_inputs
+
+
+def _resolve_generated_path(value: Any, output_root: Path) -> Path:
+ path = Path(str(value or "")).expanduser()
+ if path.is_absolute():
+ return path.resolve()
+ return (output_root / path).resolve()
diff --git a/embodichain/gen_sim/prompt2scene/cli/__init__.py b/embodichain/gen_sim/prompt2scene/cli/__init__.py
new file mode 100644
index 000000000..015c41510
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/cli/__init__.py
@@ -0,0 +1,19 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/embodichain/gen_sim/prompt2scene/cli/start.py b/embodichain/gen_sim/prompt2scene/cli/start.py
new file mode 100644
index 000000000..22e9e7e44
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/cli/start.py
@@ -0,0 +1,99 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.pipeline.runner import run_prompt2scene
+from embodichain.gen_sim.prompt2scene.llms import load_llm_config
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+
+__all__ = ["cli_prompt2scene", "main"]
+
+
def cli_prompt2scene(
    image_path: str | None,
    prompt: str | None,
    output_root: str,
    llm_config_path: str | None = None,
) -> None:
    """Run prompt2scene from normalized CLI argument values.

    Args:
        image_path: Path to an input image, or ``None`` when no image is given.
        prompt: Optional edit prompt.
        output_root: Directory where prompt2scene outputs are written.
        llm_config_path: Optional path to the LLM config JSON file; ``None``
            is passed through to ``load_llm_config``.
    """
    image_file = None if image_path is None else Path(image_path)
    request = Prompt2SceneInput.from_cli_args(
        image_path=image_file,
        prompt=prompt,
        output_root=Path(output_root),
    )

    config_file = None if llm_config_path is None else Path(llm_config_path)
    llm_cfg = load_llm_config(config_file)

    run_prompt2scene(request, llm_cfg=llm_cfg)
+
+
def main() -> None:
    """Parse command line arguments and launch the prompt2scene pipeline.

    Only ``--output_root`` is required; ``--image``, ``--prompt`` and
    ``--llm_config`` default to ``None``.
    """
    parser = argparse.ArgumentParser(
        description="embodichain.gen_sim.prompt2scene Prompt-to-Scene Pipeline"
    )

    # --image is registered through an optional mutually exclusive group.
    image_group = parser.add_mutually_exclusive_group(required=False)
    image_group.add_argument(
        "--image",
        type=str,
        help="Path to the input image file (.jpg, .jpeg, or .png)",
    )

    prompt_help = (
        "Optional edit instruction. Use with --image to edit after "
        "generation, or with only --output_root to edit an existing scene."
    )
    parser.add_argument("--prompt", type=str, default=None, help=prompt_help)
    parser.add_argument(
        "--output_root",
        type=str,
        required=True,
        help="Path to the output directory",
    )
    parser.add_argument(
        "--llm_config",
        type=str,
        default=None,
        help="Path to the LLM config JSON file",
    )

    namespace = parser.parse_args()
    cli_prompt2scene(
        image_path=namespace.image,
        prompt=namespace.prompt,
        output_root=namespace.output_root,
        llm_config_path=namespace.llm_config,
    )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/embodichain/gen_sim/prompt2scene/configs/client_config.json b/embodichain/gen_sim/prompt2scene/configs/client_config.json
new file mode 100644
index 000000000..b8662eaf2
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/configs/client_config.json
@@ -0,0 +1,21 @@
+{
+ "sam3_segmentation": {
+ "base_url": "http://192.168.3.23:5014",
+ "timeout_s": 1200,
+ "health_path": "/health",
+ "segment_single_object_path": "/predict"
+ },
+ "sam3d_generation": {
+ "base_url": "http://10.7.7.32:5019",
+ "timeout_s": 1800,
+ "health_path": "/health",
+ "generate_multiple_objects_path": "/generate_multiple_objects",
+ "generate_single_object_path": "/generate_single_object"
+ },
+ "zimage": {
+ "base_url": "http://192.168.3.23:5013",
+ "timeout_s": 120,
+ "health_path": "/health",
+ "generate_single_object_path": "/generate.png"
+ }
+}
diff --git a/embodichain/gen_sim/prompt2scene/configs/llm_config.json b/embodichain/gen_sim/prompt2scene/configs/llm_config.json
new file mode 100644
index 000000000..9dd825143
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/configs/llm_config.json
@@ -0,0 +1,11 @@
+{
+ "llm": {
+ "openai_compatible": {
+ "api_key": "",
+ "model": "",
+ "base_url": "",
+ "default_query": {},
+ "max_attempts": 5
+ }
+ }
+}
diff --git a/embodichain/gen_sim/prompt2scene/llms/__init__.py b/embodichain/gen_sim/prompt2scene/llms/__init__.py
new file mode 100644
index 000000000..8412eff44
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/llms/__init__.py
@@ -0,0 +1,31 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.llms.config import OpenAICompatibleLLMCfg
+from embodichain.gen_sim.prompt2scene.llms.openai_compatible import (
+ DEFAULT_LLM_CONFIG_PATH,
+ build_chat_model,
+ load_llm_config,
+)
+
+__all__ = [
+ "DEFAULT_LLM_CONFIG_PATH",
+ "OpenAICompatibleLLMCfg",
+ "build_chat_model",
+ "load_llm_config",
+]
diff --git a/embodichain/gen_sim/prompt2scene/llms/config.py b/embodichain/gen_sim/prompt2scene/llms/config.py
new file mode 100644
index 000000000..f84c4fcf9
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/llms/config.py
@@ -0,0 +1,49 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+__all__ = [
+ "OpenAICompatibleLLMCfg",
+]
+
+
@dataclass(frozen=True)
class OpenAICompatibleLLMCfg:
    """OpenAI-compatible LLM configuration.

    The instance is frozen. ``api_key`` is excluded from ``repr()`` so the
    secret does not end up in logs or tracebacks that print the config.
    """

    # repr=False: never render the secret when the config is printed.
    api_key: str = field(repr=False)
    model: str
    base_url: str
    default_query: dict[str, str] = field(default_factory=dict)
    max_attempts: int = 3

    def to_manifest(self) -> dict[str, object]:
        """Convert the LLM config to a JSON-safe manifest.

        Returns:
            LLM config metadata with sensitive values removed. The API key is
            reported only as the boolean ``has_api_key``. ``default_query``
            is a copy, so editing the manifest cannot alter this config.
        """
        return {
            "provider": "openai_compatible",
            "model": self.model,
            "base_url": self.base_url,
            "has_api_key": bool(self.api_key),
            "default_query": dict(self.default_query or {}),
            "max_attempts": self.max_attempts,
        }
diff --git a/embodichain/gen_sim/prompt2scene/llms/llm_output.py b/embodichain/gen_sim/prompt2scene/llms/llm_output.py
new file mode 100644
index 000000000..07706a11c
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/llms/llm_output.py
@@ -0,0 +1,263 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+from typing import Any, Callable
+
+__all__ = [
+ "bind_structured_output",
+ "coerce_json_object_output",
+ "is_model_output_error",
+ "call_structured_json_model_step",
+ "StructuredModelCallError",
+ "validate_json_schema",
+]
+
+
class StructuredModelCallError(Exception):
    """Retryable failure of a structured-model call.

    Carries the step ``context``, the ``attempt_count`` and the
    ``original_exc`` that was raised. The message is ``str(original_exc)``.
    """

    def __init__(
        self,
        *,
        context: str,
        attempt_count: int,
        original_exc: Exception,
    ) -> None:
        super().__init__(str(original_exc))
        self.original_exc = original_exc
        self.attempt_count = attempt_count
        self.context = context
+
+
def bind_structured_output(llm: Any, schema: dict[str, Any]) -> Any:
    """Return ``llm`` bound to ``schema`` if the wrapper supports binding.

    Wrappers without a ``with_structured_output`` attribute are returned
    unchanged.
    """
    if not hasattr(llm, "with_structured_output"):
        return llm
    return llm.with_structured_output(schema)
+
+
def coerce_json_object_output(response: Any, *, context: str) -> dict[str, Any]:
    """Coerce a model response into a JSON object.

    Accepted shapes, checked in order:

    * a ``dict``, returned as-is;
    * an object whose ``content`` attribute is a ``dict``;
    * ``content`` as a list of parts, where plain strings and
      ``{"type": "text", "text": ...}`` dicts are joined with newlines and
      parsed as JSON;
    * ``content`` as a string, parsed as JSON.

    Args:
        response: Raw model response.
        context: Label used in error messages.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the response has an unsupported type. Errors raised
            while parsing the text propagate unchanged.
    """
    if isinstance(response, dict):
        return response

    content = getattr(response, "content", response)
    if isinstance(content, dict):
        return content

    if isinstance(content, list):
        text_parts: list[str] = []
        for item in content:
            if isinstance(item, str):
                # Content lists may mix bare strings with typed part dicts;
                # dropping the strings would lose part of the JSON payload.
                text_parts.append(item)
            elif isinstance(item, dict) and item.get("type") == "text":
                text = item.get("text", "")
                # Skip non-string text so the join below cannot raise.
                if isinstance(text, str):
                    text_parts.append(text)
        content = "\n".join(text_parts)

    if isinstance(content, str):
        return _parse_json_text(content, context=context)

    raise ValueError(f"{context} model output has unsupported type: {type(response)!r}")
+
+
def is_model_output_error(exc: Exception) -> bool:
    """Tell whether ``exc`` is a retryable model-output formatting error.

    The check matches on the exception class name, or on a defining module
    whose name starts with ``pydantic``.
    """
    retryable_class_names = {
        "JSONDecodeError",
        "OutputParserException",
        "SchemaValidationError",
        "ValidationError",
        "StructuredModelCallError",
    }
    exc_class = exc.__class__
    if exc_class.__name__ in retryable_class_names:
        return True
    return exc_class.__module__.startswith("pydantic")
+
+
def validate_json_schema(
    value: Any,
    schema: dict[str, Any],
    *,
    context: str,
) -> None:
    """Check ``value`` against the locally supported subset of JSON Schema.

    Args:
        value: Decoded model output to validate.
        schema: Schema to validate against.
        context: Label used as the root path in error messages.
    """
    root_path = context
    _validate_schema_value(value, schema, path=root_path)
+
+
def call_structured_json_model_step(
    *,
    llm: Any,
    schema: dict[str, Any],
    messages: list[dict[str, Any]],
    context: str,
    attempt_count: int,
    raw_output_writer: Callable[[dict[str, Any]], None] | None = None,
) -> dict[str, Any]:
    """Invoke a structured-output model once and return its validated JSON.

    Args:
        llm: Model wrapper; bound to ``schema`` when it supports binding.
        schema: JSON schema the output must satisfy.
        messages: Chat messages passed to ``invoke``.
        context: Label used in error messages.
        attempt_count: Attempt number recorded on a raised
            ``StructuredModelCallError``.
        raw_output_writer: Optional callback that receives the validated
            output; it is called only after validation succeeds.

    Returns:
        The validated JSON object.

    Raises:
        StructuredModelCallError: If the failure is a model-output error or
            any ``ValueError``. Other exceptions propagate unchanged.
    """
    bound_model = bind_structured_output(llm, schema)
    try:
        parsed_output = coerce_json_object_output(
            bound_model.invoke(messages),
            context=context,
        )
        validate_json_schema(parsed_output, schema, context=f"{context} output")
    except Exception as exc:
        retryable = is_model_output_error(exc) or isinstance(exc, ValueError)
        if not retryable:
            raise
        raise StructuredModelCallError(
            context=context,
            attempt_count=attempt_count,
            original_exc=exc,
        ) from exc

    if raw_output_writer is not None:
        raw_output_writer(parsed_output)
    return parsed_output
+
+
+def _parse_json_text(content: str, *, context: str) -> dict[str, Any]:
+ stripped = content.strip()
+ if stripped.startswith("```"):
+ lines = stripped.splitlines()
+ if lines and lines[0].startswith("```"):
+ lines = lines[1:]
+ if lines and lines[-1].startswith("```"):
+ lines = lines[:-1]
+ stripped = "\n".join(lines).strip()
+ parsed = json.loads(stripped)
+ if not isinstance(parsed, dict):
+ raise ValueError(f"{context} model output must be a JSON object.")
+ return parsed
+
+
+def _validate_schema_value(value: Any, schema: dict[str, Any], *, path: str) -> None:
+ expected_type = schema.get("type")
+ if expected_type is not None:
+ _validate_type(value, expected_type, path=path)
+
+ enum_values = schema.get("enum")
+ if isinstance(enum_values, list) and value not in enum_values:
+ raise ValueError(f"{path} must be one of {enum_values}.")
+
+ if expected_type == "object" or isinstance(value, dict):
+ _validate_object(value, schema, path=path)
+ elif expected_type == "array" or isinstance(value, list):
+ _validate_array(value, schema, path=path)
+ elif expected_type == "string" or isinstance(value, str):
+ _validate_string(value, schema, path=path)
+ elif expected_type in {"integer", "number"}:
+ _validate_number(value, schema, path=path)
+
+
+def _validate_type(value: Any, expected_type: Any, *, path: str) -> None:
+ if isinstance(expected_type, list):
+ if any(_matches_type(value, item) for item in expected_type):
+ return
+ raise ValueError(f"{path} must match one of these types: {expected_type}.")
+
+ if not _matches_type(value, expected_type):
+ raise ValueError(f"{path} must be {expected_type}.")
+
+
+def _matches_type(value: Any, expected_type: str) -> bool:
+ if expected_type == "object":
+ return isinstance(value, dict)
+ if expected_type == "array":
+ return isinstance(value, list)
+ if expected_type == "string":
+ return isinstance(value, str)
+ if expected_type == "integer":
+ return isinstance(value, int) and not isinstance(value, bool)
+ if expected_type == "number":
+ return isinstance(value, int | float) and not isinstance(value, bool)
+ if expected_type == "boolean":
+ return isinstance(value, bool)
+ if expected_type == "null":
+ return value is None
+ return True
+
+
+def _validate_object(value: Any, schema: dict[str, Any], *, path: str) -> None:
+ if not isinstance(value, dict):
+ return
+
+ properties = schema.get("properties")
+ properties = properties if isinstance(properties, dict) else {}
+
+ required = schema.get("required", [])
+ if isinstance(required, list):
+ missing = [key for key in required if key not in value]
+ if missing:
+ raise ValueError(f"{path} missing required keys: {missing}.")
+
+ if schema.get("additionalProperties") is False:
+ extra = sorted(set(value) - set(properties))
+ if extra:
+ raise ValueError(f"{path} has unexpected keys: {extra}.")
+
+ for key, child_schema in properties.items():
+ if key not in value or not isinstance(child_schema, dict):
+ continue
+ _validate_schema_value(value[key], child_schema, path=f"{path}.{key}")
+
+
+def _validate_array(value: Any, schema: dict[str, Any], *, path: str) -> None:
+ if not isinstance(value, list):
+ return
+
+ min_items = schema.get("minItems")
+ if isinstance(min_items, int) and len(value) < min_items:
+ raise ValueError(f"{path} must contain at least {min_items} items.")
+
+ max_items = schema.get("maxItems")
+ if isinstance(max_items, int) and len(value) > max_items:
+ raise ValueError(f"{path} must contain at most {max_items} items.")
+
+ items_schema = schema.get("items")
+ if not isinstance(items_schema, dict):
+ return
+
+ for index, item in enumerate(value):
+ _validate_schema_value(item, items_schema, path=f"{path}[{index}]")
+
+
+def _validate_string(value: Any, schema: dict[str, Any], *, path: str) -> None:
+ if not isinstance(value, str):
+ return
+
+ min_length = schema.get("minLength")
+ if isinstance(min_length, int) and len(value) < min_length:
+ raise ValueError(f"{path} must contain at least {min_length} characters.")
+
+ max_length = schema.get("maxLength")
+ if isinstance(max_length, int) and len(value) > max_length:
+ raise ValueError(f"{path} must contain at most {max_length} characters.")
+
+
+def _validate_number(value: Any, schema: dict[str, Any], *, path: str) -> None:
+ if not isinstance(value, int | float) or isinstance(value, bool):
+ return
+
+ minimum = schema.get("minimum")
+ if isinstance(minimum, int | float) and value < minimum:
+ raise ValueError(f"{path} must be greater than or equal to {minimum}.")
diff --git a/embodichain/gen_sim/prompt2scene/llms/openai_compatible.py b/embodichain/gen_sim/prompt2scene/llms/openai_compatible.py
new file mode 100644
index 000000000..91e94a594
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/llms/openai_compatible.py
@@ -0,0 +1,115 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+from langchain_openai import ChatOpenAI
+
+from embodichain.gen_sim.prompt2scene.llms.config import OpenAICompatibleLLMCfg
+
+__all__ = ["DEFAULT_LLM_CONFIG_PATH", "build_chat_model", "load_llm_config"]
+
+# Config file used when the caller passes no path: ``configs/llm_config.json``
+# inside the directory that contains this module's own directory.
+DEFAULT_LLM_CONFIG_PATH = (
+    Path(__file__).resolve().parents[1] / "configs" / "llm_config.json"
+)
+
+
+def load_llm_config(config_path: Path | None = None) -> OpenAICompatibleLLMCfg:
+ """Load the prompt2scene OpenAI-compatible LLM config.
+
+ Args:
+ config_path: Optional path to the LLM config JSON file.
+
+ Returns:
+ Parsed OpenAI-compatible LLM config.
+
+ Raises:
+ FileNotFoundError: If the config file does not exist.
+ ValueError: If required config fields are missing.
+ """
+ config_path = config_path or DEFAULT_LLM_CONFIG_PATH
+ config_path = config_path.expanduser().resolve()
+
+ if not config_path.exists():
+ raise FileNotFoundError(f"LLM config not found: {config_path}")
+
+ with config_path.open("r", encoding="utf-8") as f:
+ raw_cfg: dict[str, Any] = json.load(f)
+
+ cfg = raw_cfg.get("llm", {}).get("openai_compatible", {})
+ api_key = os.getenv("OPENAI_API_KEY") or cfg.get("api_key", "")
+ model = os.getenv("OPENAI_MODEL") or cfg.get("model", "")
+ base_url = os.getenv("OPENAI_BASE_URL") or cfg.get("base_url", "")
+ default_query = cfg.get("default_query", {})
+ max_attempts = _load_positive_int(
+ os.getenv("OPENAI_MAX_ATTEMPTS") or cfg.get("max_attempts", 3),
+ key="max_attempts",
+ )
+
+ if base_url:
+ base_url = base_url.rstrip("/")
+
+ missing = [
+ name
+ for name, value in {
+ "api_key": api_key,
+ "model": model,
+ "base_url": base_url,
+ }.items()
+ if not value
+ ]
+ if missing:
+ raise ValueError(f"Missing required LLM config keys: {missing}")
+
+ if not isinstance(default_query, dict):
+ raise ValueError("LLM config key default_query must be a dict.")
+
+ return OpenAICompatibleLLMCfg(
+ api_key=api_key,
+ model=model,
+ base_url=base_url,
+ default_query=default_query,
+ max_attempts=max_attempts,
+ )
+
+
+def _load_positive_int(value: object, *, key: str) -> int:
+ try:
+ parsed = int(value)
+ except (TypeError, ValueError) as exc:
+ raise ValueError(f"LLM config key {key} must be an integer.") from exc
+ if parsed < 1:
+ raise ValueError(f"LLM config key {key} must be >= 1.")
+ return parsed
+
+
+def build_chat_model(cfg: OpenAICompatibleLLMCfg) -> Any:
+ """Build a LangChain OpenAI-compatible chat model."""
+ kwargs: dict[str, Any] = {
+ "api_key": cfg.api_key,
+ "base_url": cfg.base_url,
+ "model": cfg.model,
+ "temperature": 0,
+ }
+ if cfg.default_query:
+ kwargs["default_query"] = cfg.default_query
+
+ return ChatOpenAI(**kwargs)
diff --git a/embodichain/gen_sim/prompt2scene/pipeline/__init__.py b/embodichain/gen_sim/prompt2scene/pipeline/__init__.py
new file mode 100644
index 000000000..a1450f03c
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/pipeline/__init__.py
@@ -0,0 +1,25 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.pipeline.runner import (
+ Prompt2SceneRunResult,
+ run_prompt2scene,
+)
+
+# Names re-exported from the runner module as this package's public API.
+__all__ = ["Prompt2SceneRunResult", "run_prompt2scene"]
+
diff --git a/embodichain/gen_sim/prompt2scene/pipeline/runner.py b/embodichain/gen_sim/prompt2scene/pipeline/runner.py
new file mode 100644
index 000000000..1e651c3e3
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/pipeline/runner.py
@@ -0,0 +1,243 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from embodichain.gen_sim.prompt2scene.llms import OpenAICompatibleLLMCfg
+from embodichain.gen_sim.prompt2scene.workflows.request import (
+ InputKind,
+ Prompt2SceneInput,
+)
+from embodichain.gen_sim.prompt2scene.workflows.paths import (
+ IMAGE_SEGMENTS_STEP,
+ IMAGE_SPATIAL_RELATIONS_STEP,
+ SCENE_EDIT_STEP,
+ SCENE_INTAKE_STEP,
+ UNIFIED_SCENE_STEP,
+ PipelinePaths,
+)
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+ write_step_result,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.graph import (
+ run_unified_scene,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.graph import (
+ run_unified_scene_gen,
+)
+from embodichain.gen_sim.prompt2scene.workflows.gym_export import (
+ export_gym_config,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_edit import run_scene_edit
+from embodichain.gen_sim.prompt2scene.workflows.scene_edit.schema import (
+ SceneEditRequest,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+from embodichain.gen_sim.prompt2scene.utils import log
+from embodichain.gen_sim.prompt2scene.workflows.image_relations import (
+ run_image_relations,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake import run_scene_intake
+
+__all__ = [
+ "IMAGE_SEGMENTS_DIRNAME",
+ "IMAGE_SPATIAL_RELATIONS_DIRNAME",
+ "INPUT_MANIFEST_FILENAME",
+ "SCENE_EDIT_DIRNAME",
+ "SCENE_INTAKE_DIRNAME",
+ "STEP_RESULT_FILENAME",
+ "UNIFIED_SCENE_DIRNAME",
+ "Prompt2SceneRunResult",
+ "run_prompt2scene",
+]
+
+INPUT_MANIFEST_FILENAME = "input_manifest.json"
+SCENE_INTAKE_DIRNAME = SCENE_INTAKE_STEP
+SCENE_EDIT_DIRNAME = SCENE_EDIT_STEP
+IMAGE_SEGMENTS_DIRNAME = IMAGE_SEGMENTS_STEP
+IMAGE_SPATIAL_RELATIONS_DIRNAME = IMAGE_SPATIAL_RELATIONS_STEP
+UNIFIED_SCENE_DIRNAME = UNIFIED_SCENE_STEP
+
+
+@dataclass(frozen=True)
+class Prompt2SceneRunResult:
+    """Immutable record of the paths produced by one prompt2scene run.
+
+    Every field other than ``output_root`` and ``manifest_path`` defaults to
+    ``None``.
+
+    Attributes:
+        output_root: Directory where prompt2scene outputs were written.
+        manifest_path: Path to the serialized input manifest.
+        scene_intake_path: Path to the serialized scene intake output.
+        image_segments_path: Path to serialized image segment alignment output.
+        image_spatial_relations_path: Path to serialized image spatial relations.
+        unified_scene_path: Path to serialized unified scene output.
+        gym_config_path: Path to the exported gym config.
+        scene_edit_path: Path to serialized scene edit output.
+    """
+
+    output_root: Path
+    manifest_path: Path
+    scene_intake_path: Path | None = None
+    image_segments_path: Path | None = None
+    image_spatial_relations_path: Path | None = None
+    unified_scene_path: Path | None = None
+    gym_config_path: Path | None = None
+    scene_edit_path: Path | None = None
+
+
+def run_prompt2scene(
+    request: Prompt2SceneInput,
+    llm_cfg: OpenAICompatibleLLMCfg | None = None,
+) -> Prompt2SceneRunResult:
+    """Run the prompt2scene pipeline and report the artifact paths.
+
+    The output directory is created and the input manifest is always written
+    first; the manifest gains an ``llm`` entry when ``llm_cfg`` is given.
+    What runs next depends on the request:
+
+    * ``InputKind.EDIT``: only the scene-edit step runs.
+    * Any other kind with ``llm_cfg``: scene intake runs and its result is
+      written. For ``InputKind.IMAGE`` the run continues with image
+      relations, unified scene, unified scene generation and gym export,
+      then scene edit when ``request.prompt`` is non-empty.
+    * Any other kind without ``llm_cfg``: no further step runs.
+
+    Args:
+        request: Parsed prompt2scene input.
+        llm_cfg: Optional LLM config. It is required for every step except
+            the edit-only flow, where it is passed through as given.
+
+    Returns:
+        Paths created by the runner. Entries for steps that did not run are
+        ``None``.
+
+    Raises:
+        ValueError: If ``llm_cfg`` is given and ``request.input_kind`` is
+            neither ``EDIT`` nor ``IMAGE``. The scene intake result has
+            already been written when this is raised.
+    """
+    log.log_info(
+        "run start "
+        f"input_kind={request.input_kind.value} output_root={request.output_root}"
+    )
+    request.output_root.mkdir(parents=True, exist_ok=True)
+    paths = PipelinePaths(request.output_root)
+    manifest_path = request.output_root / INPUT_MANIFEST_FILENAME
+    manifest = request.to_manifest()
+    if llm_cfg is not None:
+        manifest["llm"] = llm_cfg.to_manifest()
+    write_json(manifest_path, manifest)
+
+    # Paths stay None for every step that does not run in this invocation.
+    scene_intake_path = None
+    image_segments_path = None
+    image_spatial_relations_path = None
+    unified_scene_path = None
+    gym_config_path = None
+    scene_edit_path = None
+    if request.input_kind == InputKind.EDIT:
+        # Edit-only flow: a missing prompt is passed on as an empty string.
+        log.log_info("step start scene_edit")
+        run_scene_edit(
+            SceneEditRequest(
+                output_root=request.output_root,
+                prompt=request.prompt or "",
+            ),
+            llm_cfg=llm_cfg,
+        )
+        scene_edit_path = paths.step_result(SCENE_EDIT_STEP)
+        log.log_info(
+            f"step end scene_edit status=pending_implementation output={scene_edit_path}"
+        )
+    elif llm_cfg is not None:
+        log.log_info("step start scene_intake")
+        scene_intake = run_scene_intake(request, llm_cfg=llm_cfg)
+        scene_intake_path = write_step_result(
+            request.output_root,
+            SCENE_INTAKE_STEP,
+            scene_intake.to_manifest(),
+        )
+        log.log_info(
+            f"step end scene_intake status=ok output={scene_intake_path}"
+        )
+        # Only image input continues past scene intake; the intake result
+        # above has already been written when this raises.
+        if request.input_kind != InputKind.IMAGE:
+            raise ValueError(
+                f"Unsupported prompt2scene input_kind: {request.input_kind.value!r}."
+            )
+        log.log_info("step start image_relations")
+        image_relations = run_image_relations(
+            request,
+            scene_intake=scene_intake,
+            llm_cfg=llm_cfg,
+            output_root=request.output_root,
+        )
+        image_segments_path = paths.step_result(
+            IMAGE_SEGMENTS_STEP,
+        )
+        # The step result is written here only when the file is not already
+        # present after run_image_relations.
+        if not image_segments_path.is_file():
+            write_step_result(
+                request.output_root,
+                IMAGE_SEGMENTS_STEP,
+                image_relations.to_segmentation_manifest(),
+            )
+        image_spatial_relations_path = paths.step_result(
+            IMAGE_SPATIAL_RELATIONS_STEP,
+        )
+        if not image_spatial_relations_path.is_file():
+            write_step_result(
+                request.output_root,
+                IMAGE_SPATIAL_RELATIONS_STEP,
+                image_relations.to_spatial_manifest(),
+            )
+        log.log_info(
+            "step end image_relations "
+            f"status=ok output={image_spatial_relations_path}"
+        )
+        log.log_info("step start unified_scene")
+        # The return value is not used below; only the step-result path is
+        # handed to the next step.
+        unified_scene = run_unified_scene(
+            request,
+            scene_intake=scene_intake,
+            image_relations=image_relations,
+            output_root=request.output_root,
+        )
+        unified_scene_path = paths.step_result(
+            UNIFIED_SCENE_STEP,
+        )
+        log.log_info(
+            f"step end unified_scene status=ok output={unified_scene_path}"
+        )
+        log.log_info("step start unified_scene_gen")
+        run_unified_scene_gen(
+            request.output_root,
+            unified_scene_result_path=unified_scene_path,
+            llm_cfg=llm_cfg,
+        )
+        log.log_info("step end unified_scene_gen status=ok")
+
+        log.log_info("step start gym_export")
+        gym_config_path = export_gym_config(request.output_root)
+        log.log_info(f"step end gym_export status=ok output={gym_config_path}")
+        # A non-empty prompt alongside the image triggers a scene edit on
+        # the same output root.
+        if request.prompt:
+            log.log_info("step start scene_edit")
+            run_scene_edit(
+                SceneEditRequest(
+                    output_root=request.output_root,
+                    prompt=request.prompt,
+                ),
+                llm_cfg=llm_cfg,
+            )
+            scene_edit_path = paths.step_result(SCENE_EDIT_STEP)
+            log.log_info(
+                f"step end scene_edit status=pending_implementation output={scene_edit_path}"
+            )
+
+    log.log_info(f"run end output_root={request.output_root}")
+
+    return Prompt2SceneRunResult(
+        output_root=request.output_root,
+        manifest_path=manifest_path,
+        scene_intake_path=scene_intake_path,
+        image_segments_path=image_segments_path,
+        image_spatial_relations_path=image_spatial_relations_path,
+        unified_scene_path=unified_scene_path,
+        gym_config_path=gym_config_path,
+        scene_edit_path=scene_edit_path,
+    )
diff --git a/embodichain/gen_sim/prompt2scene/prompts/__init__.py b/embodichain/gen_sim/prompt2scene/prompts/__init__.py
new file mode 100644
index 000000000..f772b5399
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/__init__.py
@@ -0,0 +1,53 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from . import data
+from .base import PromptRenderer
+
+# Module-wide renderer bound to the bundled ``data`` package.
+default_prompt_renderer = PromptRenderer(data)
+
+__all__ = [
+    "load_prompt",
+    "load_prompt_data",
+    "render_prompt",
+    "default_prompt_renderer",
+]
+
+
+def load_prompt(prompt_name: str) -> str:
+ """Load a prompt template from the bundled prompt data directory."""
+ return default_prompt_renderer.load_prompt(prompt_name)
+
+
+def load_prompt_data(prompt_name: str) -> dict[str, object]:
+ """Load a YAML prompt data file from the bundled prompt data directory."""
+ return default_prompt_renderer.load_prompt_data(prompt_name)
+
+
+def render_prompt(
+ prompt_name: str,
+ values: dict[str, object] | None = None,
+ *,
+ prompt_key: str | None = None,
+) -> str:
+ """Load a prompt template and fill optional placeholders."""
+ return default_prompt_renderer.render_prompt(
+ prompt_name,
+ values,
+ prompt_key=prompt_key,
+ )
diff --git a/embodichain/gen_sim/prompt2scene/prompts/base.py b/embodichain/gen_sim/prompt2scene/prompts/base.py
new file mode 100644
index 000000000..a145735cb
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/base.py
@@ -0,0 +1,79 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from functools import lru_cache
+from importlib import resources
+from pathlib import Path
+from string import Template
+from typing import Any, Mapping
+
+import yaml
+
+__all__ = ["PromptRenderer"]
+
+
+class PromptRenderer:
+ """Load and render bundled prompt templates."""
+
+ def __init__(self, package: Any) -> None:
+ self._package = package
+
+ @lru_cache(maxsize=None)
+ def load_prompt(self, prompt_name: str) -> str:
+ """Load a plain-text prompt template by file name."""
+ prompt_path = self._get_prompt_path(prompt_name)
+ if not prompt_path.is_file():
+ raise FileNotFoundError(f"Prompt data file not found: {prompt_name}")
+ return prompt_path.read_text(encoding="utf-8").strip()
+
+ @lru_cache(maxsize=None)
+ def load_prompt_data(self, prompt_name: str) -> dict[str, Any]:
+ """Load a YAML prompt data file by file name."""
+ prompt_path = self._get_prompt_path(prompt_name)
+ if not prompt_path.is_file():
+ raise FileNotFoundError(f"Prompt data file not found: {prompt_name}")
+
+ prompt_data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
+ if not isinstance(prompt_data, dict):
+ raise ValueError(f"Prompt YAML must contain a mapping: {prompt_name}")
+ return prompt_data
+
+ def render_prompt(
+ self,
+ prompt_name: str,
+ values: Mapping[str, object] | None = None,
+ *,
+ prompt_key: str | None = None,
+ ) -> str:
+ """Render a prompt template and fill placeholders."""
+ if prompt_key is None:
+ template = self.load_prompt(prompt_name)
+ else:
+ prompt_data = self.load_prompt_data(prompt_name)
+ template = prompt_data.get(prompt_key)
+ if not isinstance(template, str):
+ raise KeyError(f"Prompt key {prompt_key!r} not found in {prompt_name}")
+
+ if values is None:
+ return template
+ return Template(template).safe_substitute(values)
+
+ def _get_prompt_path(self, prompt_name: str) -> Path:
+ if "/" in prompt_name or "\\" in prompt_name:
+ raise ValueError(f"Prompt name must be a file name: {prompt_name}")
+ return resources.files(self._package).joinpath(prompt_name)
diff --git a/embodichain/gen_sim/prompt2scene/prompts/builders.py b/embodichain/gen_sim/prompt2scene/prompts/builders.py
new file mode 100644
index 000000000..cae50b582
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/builders.py
@@ -0,0 +1,392 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.prompts import render_prompt
+from embodichain.gen_sim.prompt2scene.utils.io import image_to_data_url
+
+__all__ = [
+    "build_filter_extra_instances_messages",
+    "build_image_metric_scale_messages",
+    "build_scene_intake_messages",
+    "build_scene_intake_verifier_messages",
+    "build_scene_edit_intent_messages",
+    "build_spatial_layout_messages",
+    "build_spatial_layout_verifier_messages",
+    "build_text_metric_scale_messages",
+    "build_up_down_flip_check_messages",
+]
+
+
+# File names of the prompt YAML documents handed to ``render_prompt``.
+SCENE_INTAKE_PROMPT = "scene_intake.yaml"
+SCENE_EDIT_PROMPT = "scene_edit.yaml"
+IMAGE_RELATIONS_PROMPT = "image_relations.yaml"
+UNIFIED_SCENE_GEN_PROMPT = "unified_scene_gen.yaml"
+
+
+
+def build_scene_intake_messages(request: Prompt2SceneInput) -> list[dict[str, Any]]:
+ """Build LangChain-compatible messages for scene intake."""
+ return [
+ {
+ "role": "system",
+ "content": render_prompt(
+ SCENE_INTAKE_PROMPT, prompt_key="image_system"
+ ),
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": render_prompt(
+ SCENE_INTAKE_PROMPT, prompt_key="image_user"
+ ),
+ },
+ {
+ "type": "image_url",
+ "image_url": {"url": image_to_data_url(request.image_path)},
+ },
+ ],
+ },
+ ]
+
+
+def build_scene_intake_verifier_messages(
+ *,
+ request: Prompt2SceneInput,
+ scene_intake: SceneIntakeSpec,
+) -> list[dict[str, Any]]:
+ """Build messages for scene-intake group and count verification."""
+
+ table_draft: dict[str, object] = {
+ "name": scene_intake.table.name,
+ "description": scene_intake.table.description,
+ "complete_table_description": (
+ scene_intake.table.complete_table_description
+ ),
+ "is_complete_visible_table": scene_intake.table.is_complete_visible_table,
+ "class_candidate": list(scene_intake.table.class_candidate),
+ }
+ if scene_intake.table.object_coverage_percent is not None:
+ table_draft["object_coverage_percent"] = (
+ scene_intake.table.object_coverage_percent
+ )
+ scene_intake_json = json.dumps(
+ {
+ "table": table_draft,
+ "assets": [
+ {
+ "name": asset.name,
+ "description": asset.description,
+ "class_candidate": list(asset.class_candidate),
+ "count": asset.count,
+ }
+ for asset in scene_intake.assets
+ ],
+ },
+ ensure_ascii=False,
+ indent=2,
+ )
+
+ image_path = request.image_path
+ if image_path is None:
+ raise ValueError("Image input requires image_path.")
+ return [
+ {
+ "role": "system",
+ "content": render_prompt(
+ SCENE_INTAKE_PROMPT, prompt_key="verifier_system"
+ ),
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": render_prompt(
+ SCENE_INTAKE_PROMPT,
+ {"scene_intake_json": scene_intake_json},
+ prompt_key="verifier_image_user",
+ ),
+ },
+ {
+ "type": "image_url",
+ "image_url": {"url": image_to_data_url(image_path)},
+ },
+ ],
+ },
+ ]
+
+
+
+
+def build_filter_extra_instances_messages(
+ *,
+ debug_image_path: Path,
+ name: str,
+ description: str,
+ expected_count: int,
+ class_candidate: list[str],
+) -> list[dict[str, Any]]:
+ """Build LangChain-compatible messages for VLM extra-mask filtering."""
+ return [
+ {
+ "role": "system",
+ "content": render_prompt(
+ IMAGE_RELATIONS_PROMPT, prompt_key="filter_extra_instances_system"
+ ),
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": render_prompt(
+ IMAGE_RELATIONS_PROMPT,
+ {
+ "name": name.replace("_", " "),
+ "description": description,
+ "expected_count": str(expected_count),
+ "class_candidate": ", ".join(
+ candidate.replace("_", " ")
+ for candidate in class_candidate
+ ),
+ },
+ prompt_key="filter_extra_instances_user",
+ ),
+ },
+ {
+ "type": "image_url",
+ "image_url": {"url": image_to_data_url(debug_image_path)},
+ },
+ ],
+ },
+ ]
+
+
+def build_spatial_layout_messages(
+ *,
+ bbox_name_image_path: Path,
+ asset_ids: list[str],
+) -> list[dict[str, Any]]:
+ """Build messages for VLM spatial ordering and object-state extraction."""
+ return [
+ {
+ "role": "system",
+ "content": render_prompt(
+ IMAGE_RELATIONS_PROMPT, prompt_key="spatial_layout_system"
+ ),
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": render_prompt(
+ IMAGE_RELATIONS_PROMPT,
+ {
+ "asset_ids": "\n".join(
+ f"- {asset_id}" for asset_id in asset_ids
+ ),
+ },
+ prompt_key="spatial_layout_user",
+ ),
+ },
+ {
+ "type": "image_url",
+ "image_url": {"url": image_to_data_url(bbox_name_image_path)},
+ },
+ ],
+ },
+ ]
+
+
+def build_spatial_layout_verifier_messages(
+ *,
+ bbox_name_image_path: Path,
+ asset_ids: list[str],
+ draft_spatial_layout_json: str,
+) -> list[dict[str, Any]]:
+ """Build messages for VLM spatial ordering verification."""
+ return [
+ {
+ "role": "system",
+ "content": render_prompt(
+ IMAGE_RELATIONS_PROMPT,
+ prompt_key="spatial_layout_verifier_system",
+ ),
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": render_prompt(
+ IMAGE_RELATIONS_PROMPT,
+ {
+ "asset_ids": "\n".join(
+ f"- {asset_id}" for asset_id in asset_ids
+ ),
+ "draft_spatial_layout_json": draft_spatial_layout_json,
+ },
+ prompt_key="spatial_layout_verifier_user",
+ ),
+ },
+ {
+ "type": "image_url",
+ "image_url": {"url": image_to_data_url(bbox_name_image_path)},
+ },
+ ],
+ },
+ ]
+
+
+def build_scene_edit_intent_messages(
+ *,
+ prompt: str,
+ scene_objects: list[dict[str, Any]],
+ current_relations: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+ """Build messages for editing an existing scene from a user prompt."""
+ return [
+ {
+ "role": "system",
+ "content": render_prompt(
+ SCENE_EDIT_PROMPT,
+ prompt_key="intent_system",
+ ),
+ },
+ {
+ "role": "user",
+ "content": render_prompt(
+ SCENE_EDIT_PROMPT,
+ {
+ "prompt": prompt,
+ "scene_objects_json": json.dumps(
+ scene_objects, ensure_ascii=False, indent=2
+ ),
+ },
+ prompt_key="intent_user",
+ ),
+ },
+ ]
+
+def build_image_metric_scale_messages(
+ *,
+ bbox_name_image_path: Path,
+ objects_json: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+ """Build messages for image-scene object metric scale estimation."""
+ return [
+ {
+ "role": "system",
+ "content": render_prompt(
+ UNIFIED_SCENE_GEN_PROMPT, prompt_key="image_metric_scale_system"
+ ),
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": render_prompt(
+ UNIFIED_SCENE_GEN_PROMPT,
+ {
+ "objects_json": json.dumps(
+ objects_json, ensure_ascii=False, indent=2
+ ),
+ },
+ prompt_key="image_metric_scale_user",
+ ),
+ },
+ {
+ "type": "image_url",
+ "image_url": {"url": image_to_data_url(bbox_name_image_path)},
+ },
+ ],
+ },
+ ]
+
+
+def build_text_metric_scale_messages(
+ *,
+ user_text: str,
+ objects_json: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+ """Build messages for text-scene object metric scale estimation."""
+ return [
+ {
+ "role": "system",
+ "content": render_prompt(
+ UNIFIED_SCENE_GEN_PROMPT, prompt_key="text_metric_scale_system"
+ ),
+ },
+ {
+ "role": "user",
+ "content": render_prompt(
+ UNIFIED_SCENE_GEN_PROMPT,
+ {
+ "user_text": user_text,
+ "objects_json": json.dumps(
+ objects_json, ensure_ascii=False, indent=2
+ ),
+ },
+ prompt_key="text_metric_scale_user",
+ ),
+ },
+ ]
+
+
+def build_up_down_flip_check_messages(
+ *,
+ original_image_path: Path,
+ comparison_image_path: Path,
+) -> list[dict[str, Any]]:
+ """Build messages for VLM support-normal up/down flip verification."""
+ return [
+ {
+ "role": "system",
+ "content": render_prompt(
+ UNIFIED_SCENE_GEN_PROMPT, prompt_key="up_down_flip_check_system"
+ ),
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": render_prompt(
+ UNIFIED_SCENE_GEN_PROMPT,
+ prompt_key="up_down_flip_check_user",
+ ),
+ },
+ {
+ "type": "image_url",
+ "image_url": {"url": image_to_data_url(original_image_path)},
+ },
+ {
+ "type": "image_url",
+ "image_url": {"url": image_to_data_url(comparison_image_path)},
+ },
+ ],
+ },
+ ]
diff --git a/embodichain/gen_sim/prompt2scene/prompts/data/__init__.py b/embodichain/gen_sim/prompt2scene/prompts/data/__init__.py
new file mode 100644
index 000000000..96d642123
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/data/__init__.py
@@ -0,0 +1,21 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""Bundled prompt template data files."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/embodichain/gen_sim/prompt2scene/prompts/data/image_relations.yaml b/embodichain/gen_sim/prompt2scene/prompts/data/image_relations.yaml
new file mode 100644
index 000000000..a6e4082f7
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/data/image_relations.yaml
@@ -0,0 +1,339 @@
+name: image_relations
+version: 1
+
+filter_extra_instances_system: |
+
+ You are a careful image segmentation verification assistant for tabletop scenes.
+
+
+
+ You will receive:
+ - One target object class name.
+ - One target object description.
+ - The expected number of target instances.
+ - A short candidate class list for that target object.
+ - One image with numbered colored mask outlines drawn over candidate
+ segmentation results for that target object.
+
+ Your only task is to choose which numbered masks should be removed so the
+ remaining masks best match the requested object class, target description, and
+ expected instance count.
+
+ This is not a scene-description task and not a spatial-relation task.
+ Do not describe the scene. Do not infer object-object relations. Do not rename
+ the requested object class. Do not add new masks.
+
+
+
+ - The colored mask outlines and red number labels are annotations only. Do NOT
+ treat annotation colors as the object's real color, material, texture, label,
+ logo, or surface appearance.
+ - The red number label is only an approximate index marker. It may sit near,
+ above, or partly over a neighboring or occluding object. Do NOT decide mask
+ ownership from where the number label is drawn.
+ - Decide whether a candidate is correct from the region enclosed by the colored
+ mask outline, not from the label position.
+ - Judge the object's real appearance from the underlying original image pixels
+ inside and around each outline.
+ - Use the target object class name as the primary class.
+ - Use the target description to distinguish visually similar objects from the
+ same broad category.
+ - Use the expected instance count as a hard target when enough plausible masks
+ are available.
+ - Use the candidate class list only as synonyms or fallback names for the same
+ target object.
+ - If more plausible masks are present than the expected count, keep only the
+ expected number of best matches and remove the rest.
+ - If exactly the expected number of plausible masks are present, keep them.
+ - If fewer than the expected number of plausible masks are present, keep every
+ plausible mask and remove only clearly wrong or duplicate masks.
+ - Remove a numbered mask if it clearly covers a different object class.
+ - Remove a numbered mask if it is a duplicate detection of the same physical
+ instance already covered by another better mask.
+ - Remove a numbered mask if it mostly covers background, a hand, or an
+ unrelated partial region.
+ - Remove a numbered mask that mostly covers a table or support region unless
+ the requested target class itself is that table/support target.
+ - If a mask is ambiguous but plausibly covers the requested object class, keep
+ it.
+
+
+
+ {
+ "extra_instance_numbers": [3],
+ "reason": "Mask 3 covers a different object, not the requested class."
+ }
+
+
+
+ Example 1:
+ Target object class: soccer_ball
+ Target description: A round soccer ball with black-and-white panels.
+ Expected instance count: 2
+ Candidate classes: soccer_ball, football, ball, sports_ball, toy_ball
+ Observation: Masks 1 and 2 cover two soccer balls. Mask 3 covers a paper cup.
+ Output:
+ {
+ "extra_instance_numbers": [3],
+ "reason": "Masks 1 and 2 are soccer balls; mask 3 is a paper cup."
+ }
+
+ Example 2:
+ Target object class: apple
+ Target description: A round red apple with smooth skin.
+ Expected instance count: 1
+ Candidate classes: apple, fruit, red_apple, food, produce
+ Observation: Mask 1 tightly covers the apple. Mask 2 overlaps the same apple and
+ is a duplicate looser detection.
+ Output:
+ {
+ "extra_instance_numbers": [2],
+ "reason": "Mask 2 is a duplicate detection of the same apple covered by mask 1."
+ }
+
+ Example 3:
+ Target object class: mug
+ Target description: A white ceramic coffee mug with a handle.
+ Expected instance count: 1
+ Candidate classes: mug, coffee_mug, cup, drinkware, ceramic_cup
+ Observation: Mask 1 covers a real mug. Mask 2 covers a bowl.
+ Output:
+ {
+ "extra_instance_numbers": [2],
+ "reason": "Mask 1 is a mug; mask 2 is a bowl and should be removed."
+ }
+
+ Example 4:
+ Target object class: fork
+ Target description: A silver metal fork with four tines.
+ Expected instance count: 1
+ Candidate classes: fork, dinner_fork, utensil, cutlery, tableware
+ Observation: Mask 1 plausibly covers a fork, although part of it is occluded.
+ Output:
+ {
+ "extra_instance_numbers": [],
+ "reason": "Mask 1 plausibly covers the requested fork and should be kept."
+ }
+
+
+
+ - extra_instance_numbers must contain 1-based mask numbers exactly as shown in
+ the numbered-mask image.
+ - If no masks should be removed, output an empty list.
+ - Output JSON only. Do not include markdown or explanations outside JSON.
+
+
+filter_extra_instances_user: |
+ Verify the numbered segmentation masks for this object class:
+
+
+ Target object class: $name
+ Target description: $description
+ Expected instance count: $expected_count
+ Candidate classes: $class_candidate
+
+
+
+ Inspect the numbered-mask-outline image.
+ The colored outlines and number labels are annotations only, not real object
+ colors or textures.
+ The red number label is only an approximate index marker; judge each candidate
+ by the object region enclosed by that numbered outline, not by where the label
+ happens to be placed.
+ Return the 1-based numbers of masks that should be removed so the remaining
+ masks best match the target description and expected instance count.
+
+
+spatial_layout_system: |
+
+ You are a careful tabletop spatial-layout verifier.
+
+
+
+ You will receive one tabletop image with final bounding boxes and labels for
+ every detected object instance. Your task is to output:
+ - One anchor object, its 9-grid table location, and the reason for choosing it
+ and assigning that grid.
+ - Object groups ordered from left to right.
+ - Object groups ordered from front to back.
+ - Whether each object has arbitrary layout, plus a concise support-pose reason.
+
+ Do not output pairwise left/right/front/behind relations. The program will
+ derive canonical left_of and front_of relations from your x_order and y_order.
+ Use ordered groups conservatively. Prefer fewer relations over a wrong
+ relation.
+
+
+
+ - x_order must be ordered from image/table left to image/table right.
+ - The camera view may be oblique, rotated, or perspective-distorted. Still
+ judge left/right from the visible tabletop image as a human would understand
+ the normal viewing direction of this image, not from object IDs, prompt
+ order, 3D generation order, or arbitrary simulation axes.
+ - The final left_of relations derived from x_order must reflect your visual
+ judgment from the attached image under that normal image/table view.
+ - y_order must be ordered from table front to table back.
+ - Split x_order groups when the left/right order is reasonably clear from the
+ bbox-name image.
+ - If an object's left/right order is ambiguous, keep it in a shared x_order
+ group. Never omit it.
+ - Front/back is especially hard to judge. Split y_order only when depth
+ separation is obvious, preferably from contact positions or bbox bottoms.
+ - If front/back is close, roughly collinear, overlapping, occluded, similarly
+ aligned, or hard to compare, place objects in the same y_order group.
+ - Ordered groups are interpreted as monotonic DAG ranks. The program only
+ creates direct edges between adjacent groups, then derives transitive
+ closure. For example, G1 < G2 < G3 creates direct edges G1 -> G2 and
+ G2 -> G3; G1 -> G3 is implicit.
+
+
+
+ - Choose one clearly visible object as anchor.
+ - Prefer a large, unoccluded object whose 9-grid location is easy to judge.
+ - The anchor reason must explain both why this object was selected and why its
+ grid is correct.
+ - The anchor grid must be one of:
+ center, front, back, left_center, right_center, left_front, right_front,
+ left_back, right_back.
+
+
+
+ - is_arbitrary_layout is true when the object does not need a specified
+ support pose before physics simulation, such as balls, round fruits, loose
+ natural objects, or objects that will naturally settle by gravity.
+ - is_arbitrary_layout is false when the object needs a deliberate support pose,
+ such as cups, bottles, cans, boxes, utensils, remotes, blocks, bags, or
+ objects that should stand or lie in a controlled way.
+ - If is_arbitrary_layout is false, the reason must describe the default support
+ pose visible or implied in the image, such as standing upright on the table,
+ lying flat on the table, lying on its side, or leaning against another object.
+ - If is_arbitrary_layout is true, the reason must explain that the object can
+ settle naturally under gravity or has no meaningful preset support pose.
+
+
+
+ {
+ "anchor": {
+ "asset_id": "interact_paper_cup_0",
+ "grid": "center",
+ "reason": "The paper cup is clearly visible and near the table center, so it is a reliable anchor for the center grid."
+ },
+ "x_order": [
+ ["interact_wooden_block_0"],
+ ["interact_paper_cup_0"],
+ ["interact_snack_bag_0"]
+ ],
+ "y_order": [
+ ["interact_paper_cup_0"],
+ ["interact_wooden_block_0", "interact_snack_bag_0"]
+ ],
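+ "asset_states": [
+ {
+ "asset_id": "interact_wooden_block_0",
+ "is_arbitrary_layout": false,
+ "reason": "The wooden block is lying flat on the table, so it needs a deliberate flat support pose."
+ },
+ {
+ "asset_id": "interact_paper_cup_0",
+ "is_arbitrary_layout": false,
+ "reason": "The paper cup is standing upright on the table, so it needs a deliberate upright support pose."
+ },
+ {
+ "asset_id": "interact_snack_bag_0",
+ "is_arbitrary_layout": false,
+ "reason": "The snack bag is lying flat on the table, so it needs a deliberate lying support pose."
+ }
+ ]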
+ }
+
+
+
+ - Every provided asset_id must appear exactly once in x_order.
+ - Every provided asset_id must appear exactly once in y_order.
+ - Every provided asset_id must appear exactly once in asset_states.
+ - Use one large group on an axis if the left-right or front-back order is not
+ visually obvious. Do not omit uncertain objects.
+ - anchor.asset_id must be one of the provided asset_ids.
+ - anchor.reason and every asset state reason must be concise but explicit.
+ - Only the anchor may have a grid. Do not add grid to asset_states.
+ - Output JSON only. Do not include markdown or explanations outside JSON.
+
+
+spatial_layout_user: |
+ Infer spatial order, anchor grid, and object states for these detected object instances:
+
+
+ $asset_ids
+
+
+ Inspect the attached bbox-name image and return the JSON object.
+
+spatial_layout_verifier_system: |
+
+ You are a strict verifier for tabletop spatial ordering.
+
+
+
+ You will receive one bbox-name tabletop image, the full asset_id list, and a
+ draft spatial layout JSON. Verify whether the draft x_order and y_order are
+ visually correct.
+
+ If the draft is correct, set passed=true and return the same layout in
+ corrected_layout. If any order is wrong, or forces a specific order that the
+ image does not clearly support, set passed=false and return a complete
+ corrected spatial layout JSON in corrected_layout.
+
+
+
+ - x_order is the source for derived left_of relations. Check it carefully.
+ - The camera view may be oblique, rotated, or perspective-distorted. Still
+ judge left/right from the visible tabletop image as a human would understand
+ the normal viewing direction of this image, not from object IDs, prompt
+ order, 3D generation order, or arbitrary simulation axes.
+ - If a left/right relation is unclear, overlapping, occluded, or visually too
+ close to trust, place the objects in the same x_order group instead of
+ forcing an order.
+ - Check y_order more conservatively than x_order. If front/back is uncertain,
+ place objects in the same y_order group.
+ - Every asset_id must appear exactly once in corrected_layout.x_order,
+ corrected_layout.y_order, and corrected_layout.asset_states.
+ - Preserve or correct anchor and asset_states as needed, but focus primarily
+ on spatial order correctness.
+ - Return JSON only.
+
+
+
+ {
+ "passed": false,
+ "reason": "The draft placed two overlapping objects in a forced left/right order, but the image does not support that relation.",
+ "corrected_layout": {
+ "anchor": {
+ "asset_id": "interact_paper_cup_0",
+ "grid": "center",
+ "reason": "The paper cup is clearly visible near the table center."
+ },
+ "x_order": [
+ ["interact_paper_cup_0", "interact_snack_bag_0"]
+ ],
+ "y_order": [
+ ["interact_paper_cup_0", "interact_snack_bag_0"]
+ ],
+ "asset_states": [
+ {
+ "asset_id": "interact_paper_cup_0",
+ "is_arbitrary_layout": false,
+ "reason": "The paper cup is standing upright and needs an upright support pose."
+ },
+ {
+ "asset_id": "interact_snack_bag_0",
+ "is_arbitrary_layout": false,
+ "reason": "The snack bag should keep a deliberate lying or leaning support pose."
+ }
+ ]
+ }
+ }
+
+
+spatial_layout_verifier_user: |
+ Verify this draft spatial layout for the detected object instances:
+
+
+ $asset_ids
+
+
+
+ $draft_spatial_layout_json
+
+
+ Inspect the attached bbox-name image. Return whether the draft passes, the
+ reason, and a complete corrected_layout JSON.
diff --git a/embodichain/gen_sim/prompt2scene/prompts/data/scene_edit.yaml b/embodichain/gen_sim/prompt2scene/prompts/data/scene_edit.yaml
new file mode 100644
index 000000000..1b1a0a025
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/data/scene_edit.yaml
@@ -0,0 +1,129 @@
+intent_system: |
+ You are a scene-edit intent planner for a tabletop simulation scene.
+
+ You receive:
+ - A user edit prompt.
+ - Existing scene objects. Each object has only id, name, and description.
+
+ Your job is to produce a structured edit intent. Do not generate geometry.
+
+ Rules:
+ - Existing objects must be referenced only by their exact object id.
+ - deleted_object_ids may contain only ids from the existing scene objects.
+ - A replacement is represented as deleting the target object and adding one generated object.
+ - Every added or replacement object must appear in generated_objects with a temp_id, canonical snake_case name, and standalone detailed description suitable for text-to-geometry simready generation.
+ - For delete operations, identify only the existing object ids to delete.
+ - For replacement operations, set type=replace, set target_object_id to the existing object being replaced, create exactly one generated object, and set placement.type=preserve_target unless the user gives an explicit new placement.
+ - For add operations, infer placement from the user prompt when possible.
+ - Layout may be expressed either as a relation to an existing object or as a 9-grid location.
+ - A 9-grid location means the object's center or main tabletop footprint should be placed in one of the nine regions of the table support region split in XY: center, front, back, left_center, right_center, left_front, right_front, left_back, right_back. It is not a grid on the object itself.
+ - For relative layout, use placement.type=relative_to_object, exact reference_object_id, and one of left_of, right_of, front_of, or back_of.
+ - For 9-grid layout, use placement.type=grid and one of: center, front, back, left_center, right_center, left_front, right_front, left_back, right_back.
+ - Do not compute the final relation closure yourself. The program will remove deleted ids, inherit replacement layout, normalize right/back, preserve unaffected relations, add new layout, and compute transitive closure.
+ - Do not invent a target object when the prompt is ambiguous. Put the ambiguity in unresolved.
+ - If an operation cannot be grounded to an existing target or reference object, keep it unresolved instead of guessing.
+ - The top-level key reason is required.
+ - Every generated object must include source_operation.
+ - Every operation must include reason and confidence.
+ - Every unresolved item must include reason.
+ - Output JSON only.
+
+ Example 1:
+ Existing scene objects:
+ [
+ {
+ "id": "interact_plastic_water_bottle_0",
+ "name": "plastic_water_bottle",
+ "description": "A clear plastic water bottle with a white cap and a green label."
+ },
+ {
+ "id": "interact_spiral_notebook_0",
+ "name": "spiral_notebook",
+ "description": "A spiral-bound notebook with a tan cover and black binding."
+ }
+ ]
+ User prompt:
+ replace the water bottle with a big red apple
+ Output:
+ {
+ "deleted_object_ids": ["interact_plastic_water_bottle_0"],
+ "generated_objects": [
+ {
+ "temp_id": "new_red_apple_0",
+ "name": "red_apple",
+ "description": "A large red apple with smooth glossy skin, a round body, and a short brown stem.",
+ "source_operation": "replace"
+ }
+ ],
+ "operations": [
+ {
+ "type": "replace",
+ "target_object_id": "interact_plastic_water_bottle_0",
+ "new_object_temp_id": "new_red_apple_0",
+ "placement": {
+ "type": "preserve_target",
+ "reference_object_id": "",
+ "relation": "",
+ "grid": ""
+ },
+ "reason": "The user explicitly replaces the water bottle with a new apple object.",
+ "confidence": 0.98
+ }
+ ],
+ "unresolved": [],
+ "reason": "Replace the water bottle and keep its original placement context."
+ }
+
+ Example 2:
+ Existing scene objects:
+ [
+ {
+ "id": "interact_spiral_notebook_0",
+ "name": "spiral_notebook",
+ "description": "A spiral-bound notebook with a tan cover and black binding."
+ }
+ ]
+ User prompt:
+ add a blue mug to the left front of the table
+ Output:
+ {
+ "deleted_object_ids": [],
+ "generated_objects": [
+ {
+ "temp_id": "new_blue_mug_0",
+ "name": "blue_mug",
+ "description": "A ceramic blue mug with a glossy finish, a rounded body, and a side handle.",
+ "source_operation": "add"
+ }
+ ],
+ "operations": [
+ {
+ "type": "add",
+ "target_object_id": "",
+ "new_object_temp_id": "new_blue_mug_0",
+ "placement": {
+ "type": "grid",
+ "reference_object_id": "",
+ "relation": "",
+ "grid": "left_front"
+ },
+ "reason": "The user asks to add a new mug in a specific table 9-grid region.",
+ "confidence": 0.96
+ }
+ ],
+ "unresolved": [],
+ "reason": "Add a new mug at the left_front region of the table support area."
+ }
+
+intent_user: |
+ User edit prompt:
+ ${prompt}
+
+ Existing scene objects:
+ ${scene_objects_json}
+
+ Produce the scene edit intent. Remember:
+ - deleted_object_ids includes both explicitly deleted objects and replaced old objects.
+ - generated_objects includes every new object needed by add or replace operations.
+ - operations must contain enough placement information for the program to update relations and 9-grid assignments.
+ - The required keys must all be present, including top-level reason, generated_objects[].source_operation, operations[].reason, operations[].confidence, and unresolved[].reason when unresolved is non-empty.
diff --git a/embodichain/gen_sim/prompt2scene/prompts/data/scene_intake.yaml b/embodichain/gen_sim/prompt2scene/prompts/data/scene_intake.yaml
new file mode 100644
index 000000000..d43609784
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/data/scene_intake.yaml
@@ -0,0 +1,332 @@
+name: scene_intake
+version: 1
+
+image_system: |
+
+ You are a careful 3D tabletop scene intake assistant for IMAGE input.
+
+
+
+ You will receive one image of a tabletop scene.
+ This is only the first-stage scene intake step:
+ - Extract the object categories and counts on the tabletop.
+ - Extract the visible table or tabletop region that carries the objects, using
+ the fixed output field named table.
+
+ Do not analyze object-object relations, grids, orientations, stacking,
+ inside/container relations, layout, pose, masks, bounding boxes, or
+ segmentation results.
+
+
+
+ - CRITICAL: Include EVERY visible object on the tabletop without omission. Do
+ not skip, ignore, or drop any object, no matter how small, blurry, partially
+ occluded, or unfamiliar it appears. An incomplete assets list is the most
+ severe error you can make.
+ - Output only real physical objects that can become 3D asset generation targets.
+ - Do not include the table or tabletop region in assets.
+ - assets is a list of object category groups, not a list of individual object
+ instances.
+ - name must be the most specific English, singular, canonical object class
+ supported by the input.
+ - Prefer a concrete small category over a broad category. For example, output
+ fork instead of utensil, paper_cup instead of container, toy_car instead of
+ toy, remote_control instead of handheld_device, and cereal_box instead of
+ box when those categories are supported by the input.
+ - Use a broad fallback name only when the specific object category cannot be
+ reasonably inferred.
+ - Prefer snake_case names, such as apple, banana, soccer_ball, coffee_mug.
+ - Treat multiple objects as one repeated asset group only when they are
+ effectively the same object type and can share the same name, the same
+ object-only description, and the same class_candidate list without losing
+ important visual identity.
+ - Never output two asset rows with the same name. If the same name would be
+ repeated, merge them into one row and increase count.
+ - If repeated instances are truly the same asset group, output exactly one
+ asset row and set count to the number of visible or described instances.
+ - If two objects need meaningfully different descriptions, names, or
+ class_candidate lists, they are not repeated instances. Output separate
+ asset rows with specific different names.
+ - Only merge objects when they can reasonably be found by the same segmentation
+ prompts from name, class_candidate, and description.
+ - Do not merge visually different subtypes under a broad name. For example,
+ paper_cup and popcorn_cup must be separate rows, not one cup row; snack_bag
+ and paper_bag must be separate rows; remote_control and phone must be
+ separate rows.
+ - Do not output instance IDs such as apple_0 or banana_0. Instance IDs will be
+ generated by code from name and count.
+ - Do not output extra fields such as source_text, source_image_path, image_path,
+ bbox, mask, or id.
+ - class_candidate must contain exactly five English, singular, canonical
+ object class names that could help later image detection or segmentation.
+ - class_candidate must prioritize specific small categories. The first item
+ must equal name. The next items should be specific plausible classes before
+ broader fallback classes.
+ - Do not replace a known small category with a broad category. If the object is
+ a fork, include fork first; broader classes such as utensil or cutlery may
+ appear only later as fallbacks.
+ - For image inputs, if the exact object category is uncertain, use
+ class_candidate to list likely categories from specific to broader, such as
+ remote_control, handheld_device, electronic_device, gadget, tool.
+
+
+
+ - table.name, table.description, table.complete_table_description,
+ table.class_candidate, and every asset.description must be non-empty.
+ - Descriptions are used to generate images and then 3D geometry.
+ - Write each image asset description as one detailed English sentence, normally
+ 18 to 35 words.
+ - Every description must describe a SINGLE STANDALONE OBJECT isolated on a
+ pure-white background. Do NOT mention any other object, the table, the scene,
+ the room, or any background context.
+ - Do NOT include any spatial, positional, or layout information such as
+ "sitting on the table", "placed in front of", "to the left of", "on a
+ surface", "on the tabletop", etc.
+ - When describing an object, first state what the object is, then describe its
+ visible texture, color, shape, material, surface finish, graphic markings,
+ openings, handles, caps, labels, and compound structure when visible.
+ - For composite objects, describe the visible parts and how they form one
+ standalone object, such as body, cap, lid, label, wrapper, handle, rim, base,
+ nozzle, straw, or attached accessory.
+ - Focus only on what the object itself looks like, not where it is or how it
+ relates to anything else.
+ - For IMAGE inputs, include ONLY information supported by the image.
+ Do NOT invent or embellish details not visible in the image. If a color
+ is ambiguous, use a reasonable neutral description ("light-colored",
+ "dark-toned", "metallic").
+ - table.description must describe the actual visible table or tabletop region
+ as a standalone target. If the complete table is visible, describe that
+ physical table directly, including type, color, shape, material, and legs
+ when visible. If only a partial tabletop is visible, describe that visible
+ tabletop area directly.
+ - table.complete_table_description must describe a complete standalone table
+ asset for generation. If only a partial tabletop is visible, convert that
+ partial surface into a complete table description with matching color,
+ material, and texture.
+ - table.complete_table_description must always include a complete physical
+ table-like object, with a tabletop and a plausible support structure such as
+ legs, pedestal, frame, or tray body. It must not describe only a surface
+ plane, tabletop patch, texture, or support region.
+ - Do not write generic phrases such as "support surface", "tabletop", or
+ "surface" when table.name is a concrete object such as table, desk, tray,
+ counter, shelf, or floor. Use the concrete class in the description.
+ - For repeated instances, write one object-only description for the shared
+ category. Do not mention instance positions.
+ - If two objects require different descriptions, they must be separate asset
+ rows with distinct names.
+
+
+
+ - Do not output a table id. The code will set table.id to "table".
+ - The table field represents the scene table or tabletop target. table.name
+ must be the best visible class name for that target, such as table, desk,
+ dining_table, coffee_table, workbench, or tabletop.
+ - table.class_candidate must contain exactly five English, singular,
+ canonical class names for segmenting the support target. The first item must
+ equal table.name.
+ - For image inputs, set table.is_complete_visible_table to true only when a
+ mostly complete table or desk is visible and suitable as the final table
+ geometry source. "Mostly complete" means both the tabletop outline/shape is
+ mostly visible and the table/desk legs or support structure are mostly
+ visible.
+ - Set table.is_complete_visible_table to false when only a cropped tabletop
+ patch, partial table surface, or heavily occluded table is visible.
+ - Set table.is_complete_visible_table to false when the tabletop shape is not
+ mostly visible, when the legs/support structure are not visible or only
+ barely visible, or when the image only shows a surface plane.
+ - If table.is_complete_visible_table is false, table.description may describe
+ the visible partial tabletop, but table.complete_table_description must
+ describe a complete table with matching tabletop color, material, and
+ texture.
+ - If table.description describes only a visible surface or tabletop patch,
+ table.complete_table_description must rewrite it as a full table-like asset
+ with matching tabletop appearance plus plausible legs, pedestal, frame, or
+ support body.
+ - For image input with is_complete_visible_table=true ONLY: choose
+ table.object_coverage_percent from exactly one of these four values.
+ Think in terms of SPATIAL SPREAD, not pixel area: imagine drawing the
+ smallest rectangle that encloses ALL objects on the tabletop, then ask
+ what fraction of the table surface that rectangle covers. Even sparse
+ small objects can score high if they are spread across the whole table.
+ 10 (objects clustered in one small region, most of the table is bare),
+ 30 (objects spread across a noticeable portion but large bare areas remain),
+ 50 (objects reach roughly half the table extent in at least one direction),
+ 70 (objects span most of the table, even if gaps exist between them).
+ Do not output any other value.
+ - When is_complete_visible_table=false: OMIT the
+ object_coverage_percent field entirely. Do not include it in the output.
+
+
+
+ {
+ "table": {
+ "name": "table",
+ "description": "A rectangular wooden table with a brown top and four straight legs.",
+ "complete_table_description": "A complete rectangular wooden table with a brown top and four straight legs.",
+ "is_complete_visible_table": true,
+ "class_candidate": ["table", "dining_table", "desk", "wooden_table", "furniture"],
+ "object_coverage_percent": 30
+ },
+ "assets": [
+ {
+ "name": "plastic_water_bottle",
+ "description": "A clear plastic water bottle with a ribbed cylindrical body, transparent glossy surface, narrow neck, blue screw cap, and printed paper label.",
+ "class_candidate": ["plastic_water_bottle", "water_bottle", "plastic_bottle", "bottle", "drink_container"],
+ "count": 1
+ },
+ {
+ "name": "sports_bottle",
+ "description": "A matte dark sports bottle with a tapered body, textured grip band, rounded shoulder, flip-top cap, and solid opaque plastic construction.",
+ "class_candidate": ["sports_bottle", "water_bottle", "drink_bottle", "bottle", "container"],
+ "count": 1
+ }
+ ]
+ }
+
+
+
+ - The top-level object must contain only table and assets.
+ - table must contain only name, description, complete_table_description,
+ is_complete_visible_table, class_candidate, and, only when
+ is_complete_visible_table is true, object_coverage_percent.
+ - Each asset must contain only name, description, class_candidate, and count.
+ - table.name must be a non-empty string.
+ - table.description must be a non-empty string.
+ - table.complete_table_description must be a non-empty string.
+ - table.is_complete_visible_table must be a boolean.
+ - table.class_candidate must be a list of exactly five non-empty strings, and
+ the first item must equal table.name.
+ - assets must be a list.
+ - Each asset.name must be a non-empty string.
+ - Each asset.description must be a non-empty string.
+ - Each asset.class_candidate must be a list of exactly five non-empty strings.
+ - Each asset.count must be an integer greater than or equal to 1.
+ - Output JSON only. Do not include markdown or explanations outside JSON.
+
+
+image_user: |
+ Extract tabletop objects and the visible support target from this image.
+
+verifier_system: |
+
+ You are a strict scene-intake verifier for tabletop object grouping.
+
+
+
+ You will receive an original tabletop input and a draft scene_intake JSON.
+ Verify and correct the draft so it follows the same scene_intake schema.
+
+ Your main job is to check:
+ - Whether asset groups are correctly merged or split.
+ - Whether each asset count matches the visible or described instance count.
+ - Whether each name is specific enough for later image segmentation.
+ - Whether table.name, table.description, table.complete_table_description,
+ table.is_complete_visible_table, and table.class_candidate describe the
+ actual table/tabletop target.
+ - For image inputs, independently re-check table.is_complete_visible_table
+ against the original image.
+ - Independently re-check that table.complete_table_description describes a
+ complete standalone table/desk/workbench/tray-like asset, not only a surface
+ plane, tabletop patch, texture, or support region.
+
+ Return the corrected scene_intake JSON. Do not return comments, diffs, or
+ explanations.
+
+
+
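+ - CRITICAL: Do NOT remove any asset row from the draft assets list. Your job is
+ to check and correct counts, names, and class_candidate values — not to drop
+ objects. If an object exists in the draft, it must remain in the corrected
+ output. Merging rows for truly repeated instances into one row with a higher
+ count, or splitting one row into more specific rows, is allowed because no
+ object is dropped. Only add new rows if objects were clearly missed.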
+ - assets is a list of object category groups, not individual instances.
+ - Use count to represent repeated instances only when they can share the same
+ name, object-only description, and class_candidate list.
+ - If two objects need different descriptions, names, or class_candidate lists,
+ split them into separate asset rows with specific names.
+ - Never keep two asset rows with the same name. If they are truly repeated
+ instances, merge them and increase count. If they are not truly the same,
+ rename them into more specific different names.
+ - Do not merge visually different subtypes under a broad name. For example,
+ paper_cup and popcorn_cup must be separate rows, not one cup row.
+ - Prefer small, visually segmentable names such as fork, paper_cup,
+ popcorn_cup, soccer_ball, snack_bag, wooden_block.
+ - Avoid broad names such as object, item, utensil, container, cup, bag, toy,
+ box, or device when the input supports a more specific category.
+ - class_candidate must contain exactly five names; the first item must equal
+ name.
+ - table.class_candidate must contain exactly five names; the first item must
+ equal table.name.
+ - Preserve the fixed table field as the table/tabletop target.
+ - For image inputs, do not trust the draft value of
+ table.is_complete_visible_table. Judge it again from the attached original
+ image.
+ - For image inputs, table.is_complete_visible_table is true only if a mostly
+ complete table is visible and suitable as final table geometry. "Mostly
+ complete" means both the tabletop outline/shape is mostly visible and the
+ table/desk legs or support structure are mostly visible.
+ - If only a partial tabletop is visible, table.is_complete_visible_table must
+ be false and table.complete_table_description must describe a complete table
+ with matching tabletop color, material, and texture.
+ - If the table/desk legs or support structure are not visible, or if the
+ tabletop outline/shape is not mostly visible, table.is_complete_visible_table
+ must be false.
+ - table.complete_table_description must always be a complete physical
+ table-like asset description, including a tabletop and a plausible support
+ structure such as legs, pedestal, frame, or tray body. It must not describe
+ only "a surface", "a tabletop surface", "a plane", "a patch", or only a
+ material/texture.
+ - If the draft table.complete_table_description describes only a visible
+ partial surface, rewrite it into a complete table-like object with matching
+ tabletop color, material, and texture plus a plausible support structure.
+ - For image inputs, only count clearly visible target instances. If uncertain,
+ use the most conservative count supported by the image.
+ - For image input with is_complete_visible_table=true: independently
+ re-assess the tabletop coverage against the original image and pick
+ table.object_coverage_percent from exactly one of 10, 30, 50, 70.
+ Correct the draft value if the bucket does not match the visible
+ clutter density.
+ - When is_complete_visible_table is false: remove
+ object_coverage_percent from table entirely if it is present in the draft.
+
+
+
+ {
+ "table": {
+ "name": "table",
+ "description": "A rectangular wooden table with a brown top and four straight legs.",
+ "complete_table_description": "A complete rectangular wooden table with a brown top and four straight legs.",
+ "is_complete_visible_table": true,
+ "class_candidate": ["table", "dining_table", "desk", "wooden_table", "furniture"],
+ "object_coverage_percent": 30
+ },
+ "assets": [
+ {
+ "name": "plastic_water_bottle",
+ "description": "A clear plastic water bottle with a ribbed cylindrical body, transparent glossy surface, narrow neck, blue screw cap, and printed paper label.",
+ "class_candidate": ["plastic_water_bottle", "water_bottle", "plastic_bottle", "bottle", "drink_container"],
+ "count": 1
+ },
+ {
+ "name": "sports_bottle",
+ "description": "A matte dark sports bottle with a tapered body, textured grip band, rounded shoulder, flip-top cap, and solid opaque plastic construction.",
+ "class_candidate": ["sports_bottle", "water_bottle", "drink_bottle", "bottle", "container"],
+ "count": 1
+ }
+ ]
+ }
+
+
+
+ - The top-level object must contain only table and assets.
+ - table must contain only name, description, complete_table_description,
+ is_complete_visible_table, class_candidate, and optionally
+ object_coverage_percent (only when is_complete_visible_table is true).
+ - Each asset must contain only name, description, class_candidate, and count.
+ - Output JSON only. Do not include markdown or explanations outside JSON.
+
+
+verifier_image_user: |
+ Verify and correct this draft scene_intake JSON against the attached tabletop image.
+
+
+ $scene_intake_json
+
diff --git a/embodichain/gen_sim/prompt2scene/prompts/data/unified_scene_gen.yaml b/embodichain/gen_sim/prompt2scene/prompts/data/unified_scene_gen.yaml
new file mode 100644
index 000000000..22d33af32
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/data/unified_scene_gen.yaml
@@ -0,0 +1,225 @@
+name: unified_scene_gen
+version: 1
+
+up_down_flip_check_system: |
+
+ You are a careful 3D tabletop geometry orientation verifier.
+
+
+
+ You will receive:
+ - Image A: the original tabletop scene photo.
+ - Image B: one comparison image containing two fixed front-oblique
+ orthographic renders of generated 3D objects only. Each render has a
+ visible numeric label.
+
+ Your task is to choose the numbered generated render that has the correct
+ up/down orientation relative to the original photo.
+
+
+
+ - Choose selected_number=1 when candidate 1 better matches the original
+ photo's visible object tops and support-facing sides.
+ - Choose selected_number=2 when candidate 2 better matches the original
+ photo's visible object tops and support-facing sides.
+ - Do not request a yaw rotation around the vertical axis. This task is not
+ about left-right ordering or rotating the layout in the image plane; both
+ candidates have already been yaw-aligned by geometric scoring.
+ - The generated renders are not strict top views. They are slightly
+ front-oblique views so object tops and front/side faces may both be visible.
+ - Ignore the missing table/support in the candidate renders; it is
+ intentionally omitted.
+ - If the renders are ambiguous, symmetric, low quality, or insufficient to
+ distinguish up/down orientation, choose selected_number=1.
+ - confidence must be a number from 0 to 1.
+ - reason must be concise and explain the visual evidence.
+
+
+
+ {
+ "selected_number": 1,
+ "confidence": 0.72,
+ "reason": "Candidate 1 shows the visible tops of the objects more consistently with the original image."
+ }
+
+
+
+ - Output JSON only. Do not include markdown or explanations outside JSON.
+ - The JSON object must include all required keys: selected_number,
+ confidence, reason.
+ - selected_number must be exactly 1 or 2.
+
+
+up_down_flip_check_user: |
+ Compare the original scene photo with the numbered generated object-only
+ front-oblique comparison image.
+
+
+ Choose which generated render has the correct up/down orientation. Return
+ exactly one JSON object with:
+ - selected_number: 1 or 2
+ - confidence: number from 0 to 1
+ - reason: short string
+
+
+asset_metric_scale_system: |
+
+ You estimate plausible real-world tabletop object bounding-box dimensions
+ from semantic descriptions.
+
+
+
+ Given an object name and description, output one plausible real-world
+ bounding-box dimension in centimeters.
+
+
+
+ - The dimensions must be in centimeters.
+ - The order of the three dimensions does not matter; the program will match
+ shape proportions.
+ - Estimate the full real-world object bbox, not only the visible part.
+ - Use common tabletop object sizes when the description is generic.
+ - Prefer a slightly larger but still plausible tabletop size when uncertain.
+ - Use confidence to express semantic certainty, not visual certainty.
+ - Output JSON only. Do not include markdown or text outside JSON.
+
+
+
+ {
+ "bbox_dims_cm": [18.0, 8.0, 5.0],
+ "confidence": 0.72,
+ "reason": "Typical compact tabletop item size."
+ }
+
+
+asset_metric_scale_user: |
+ Estimate plausible real-world bounding-box dimensions for this object.
+
+
+ $object_name
+
+
+
+ $object_description
+
+
+ Return exactly one JSON object with:
+ - bbox_dims_cm: one slightly generous plausible size, three positive numbers in centimeters
+ - confidence: number from 0 to 1
+ - reason: short string
+
+image_metric_scale_system: |
+
+ You estimate plausible real-world tabletop object bounding-box dimensions
+ from a labeled scene image and object descriptions.
+
+
+
+ You will receive:
+ - One image with each object marked by a bounding box and its object name.
+ - One JSON list containing object_id, object_name, and object_description
+ for all objects.
+
+ For each object in the JSON list, output one plausible real-world
+ bounding-box dimension in centimeters.
+
+
+
+ - Output one entry for every object_id in the input JSON.
+ - Use the labeled image to understand the object category and relative
+ visible scale in the scene.
+ - Use object_name and object_description as semantic anchors.
+ - The dimensions must be in centimeters.
+ - The order of the three dimensions does not matter.
+ - Prefer a slightly larger but still plausible tabletop size when uncertain.
+ - Use confidence to express semantic certainty.
+ - Output JSON only. Do not include markdown or text outside JSON.
+
+
+
+ {
+ "object_scales": [
+ {
+ "object_id": "interact_cup_0",
+ "bbox_dims_cm": [8.0, 8.0, 12.0],
+ "confidence": 0.78,
+ "reason": "Typical tabletop cup size."
+ }
+ ]
+ }
+
+
+image_metric_scale_user: |
+ Estimate real-world dimensions for every object in the JSON below.
+
+
+ $objects_json
+
+
+ The attached image has bbox + name labels matching object_name. Return exactly
+ one JSON object with:
+ - object_scales: list of objects, one for every input object_id
+ - object_id: copied exactly from input
+ - bbox_dims_cm: one slightly generous plausible size, three positive numbers in centimeters
+ - confidence: number from 0 to 1
+ - reason: short string
+
+text_metric_scale_system: |
+
+ You estimate plausible real-world tabletop object bounding-box dimensions
+ from a full text scene prompt and object descriptions.
+
+
+
+ You will receive:
+ - The user's original scene text.
+ - One JSON list containing object_id, object_name, and object_description
+ for all objects.
+
+ For each object in the JSON list, output one plausible real-world
+ bounding-box dimension in centimeters.
+
+
+
+ - Output one entry for every object_id in the input JSON.
+ - Use the full scene text to infer intended object scale and context. For
+ example, a "small soccer ball on a table" should not be treated as a full
+ regulation soccer ball.
+ - Use object_name and object_description as semantic anchors.
+ - The dimensions must be in centimeters.
+ - The order of the three dimensions does not matter.
+ - Prefer a slightly larger but still plausible tabletop size when uncertain.
+ - Use confidence to express semantic certainty.
+ - Output JSON only. Do not include markdown or text outside JSON.
+
+
+
+ {
+ "object_scales": [
+ {
+ "object_id": "interact_small_soccer_ball_0",
+ "bbox_dims_cm": [6.0, 6.0, 6.0],
+ "confidence": 0.74,
+ "reason": "The scene text describes a small tabletop soccer ball."
+ }
+ ]
+ }
+
+
+text_metric_scale_user: |
+ Estimate real-world dimensions for every object in the JSON below.
+
+
+ $user_text
+
+
+
+ $objects_json
+
+
+ Return exactly one JSON object with:
+ - object_scales: list of objects, one for every input object_id
+ - object_id: copied exactly from input
+ - bbox_dims_cm: one slightly generous plausible size, three positive numbers in centimeters
+ - confidence: number from 0 to 1
+ - reason: short string
diff --git a/embodichain/gen_sim/prompt2scene/prompts/schemas.py b/embodichain/gen_sim/prompt2scene/prompts/schemas.py
new file mode 100644
index 000000000..4cebaacba
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/prompts/schemas.py
@@ -0,0 +1,559 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+"""JSON schemas for LLM structured-output calls across all workflows."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.spatial_relations import (
+ GRID_VALUE_LIST,
+ RELATION_VALUE_LIST,
+)
+
+__all__ = [
+ "FILTER_EXTRA_INSTANCES_JSON_SCHEMA",
+ "IMAGE_METRIC_SCALE_JSON_SCHEMA",
+ "SCENE_INTAKE_JSON_SCHEMA",
+ "SCENE_EDIT_INTENT_JSON_SCHEMA",
+ "SPATIAL_LAYOUT_VERIFIER_JSON_SCHEMA",
+ "SPATIAL_LAYOUT_JSON_SCHEMA",
+ "TEXT_RELATIONS_JSON_SCHEMA",
+ "UP_DOWN_FLIP_CHECK_JSON_SCHEMA",
+]
+
+
+# JSON schema for the scene-intake model output: one "table" object plus a list
+# of "assets" category groups. The top level, the table object, and every asset
+# item reject unknown keys (additionalProperties is False).
+SCENE_INTAKE_JSON_SCHEMA: dict[str, Any] = {
+    "title": "SceneIntakeModelOutput",
+    "description": (
+        "Objects and table information extracted from a text or image input."
+    ),
+    "type": "object",
+    "additionalProperties": False,
+    "properties": {
+        "table": {
+            "type": "object",
+            "additionalProperties": False,
+            "properties": {
+                # NOTE(review): no minLength here, so an empty name validates;
+                # SCENE_EDIT_INTENT_JSON_SCHEMA uses minLength 1 for its name.
+                "name": {
+                    "type": "string",
+                    "description": (
+                        "Canonical English class name for the visible table "
+                        "or tabletop target, such as table, desk, dining_table, "
+                        "coffee_table, workbench, or tabletop."
+                    ),
+                },
+                "description": {
+                    "type": "string",
+                    "minLength": 20,
+                    "maxLength": 180,
+                    "description": (
+                        "One concise standalone appearance description of the "
+                        "visible table or tabletop region."
+                    ),
+                },
+                "complete_table_description": {
+                    "type": "string",
+                    "minLength": 20,
+                    "maxLength": 220,
+                    "description": (
+                        "One concise standalone description of a complete table "
+                        "asset for text-to-3D generation, matching the visible "
+                        "tabletop color, material, and texture."
+                    ),
+                },
+                "is_complete_visible_table": {
+                    "type": "boolean",
+                    "description": (
+                        "For image input, whether a mostly complete table is "
+                        "visible and suitable as the final table geometry source. "
+                        "For text input, this should be false."
+                    ),
+                },
+                # Exactly five entries are enforced via minItems/maxItems.
+                "class_candidate": {
+                    "type": "array",
+                    "minItems": 5,
+                    "maxItems": 5,
+                    "description": (
+                        "Exactly five likely class names for segmenting the "
+                        "visible table or tabletop target."
+                    ),
+                    "items": {
+                        "type": "string",
+                        "minLength": 1,
+                    },
+                },
+                # Optional: deliberately absent from "required" below.
+                "object_coverage_percent": {
+                    "type": "integer",
+                    "enum": [10, 30, 50, 70],
+                    "description": (
+                        "For image input with a complete visible table ONLY: "
+                        "choose the closest coverage bucket for objects on the "
+                        "tabletop: 10 (mostly empty, a few small objects), "
+                        "30 (lightly cluttered), 50 (moderately cluttered), "
+                        "70 (densely packed). Omit this field entirely for "
+                        "text input or when is_complete_visible_table is false."
+                    ),
+                },
+            },
+            "required": [
+                "name",
+                "description",
+                "complete_table_description",
+                "is_complete_visible_table",
+                "class_candidate",
+            ],
+        },
+        "assets": {
+            "type": "array",
+            "description": (
+                "Object category groups on or intended for the tabletop scene."
+            ),
+            "items": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {
+                    # NOTE(review): no minLength on the asset name either.
+                    "name": {
+                        "type": "string",
+                        "description": (
+                            "Canonical English object name, singular, "
+                            "snake_case preferred."
+                        ),
+                    },
+                    "description": {
+                        "type": "string",
+                        "minLength": 20,
+                        "maxLength": 180,
+                        "description": (
+                            "One concise appearance description of the object for "
+                            "image and 3D geometry generation."
+                        ),
+                    },
+                    "class_candidate": {
+                        "type": "array",
+                        "minItems": 5,
+                        "maxItems": 5,
+                        "description": (
+                            "Exactly five likely object class names for later "
+                            "image detection or segmentation."
+                        ),
+                        "items": {
+                            "type": "string",
+                            "minLength": 1,
+                        },
+                    },
+                    "count": {
+                        "type": "integer",
+                        "description": (
+                            "Number of repeated instances in this object category "
+                            "group. Only group objects that can share the same name, "
+                            "description, and class_candidate list."
+                        ),
+                        "minimum": 1,
+                    },
+                },
+                "required": ["name", "description", "class_candidate", "count"],
+            },
+        },
+    },
+    "required": ["table", "assets"],
+}
+
+
+# JSON schema for the extra-instance filter output: a list of 1-based mask
+# numbers to remove plus a reason string. Both keys are required and unknown
+# keys are rejected.
+FILTER_EXTRA_INSTANCES_JSON_SCHEMA: dict[str, Any] = {
+    "title": "FilterExtraImageInstancesOutput",
+    "type": "object",
+    "additionalProperties": False,
+    "properties": {
+        "extra_instance_numbers": {
+            "type": "array",
+            "description": "1-based mask numbers that should be removed.",
+            # No minItems: an empty list is a valid value.
+            "items": {"type": "integer", "minimum": 1},
+        },
+        "reason": {
+            "type": "string",
+            "description": "Brief reason for the removal decision.",
+        },
+    },
+    "required": ["extra_instance_numbers", "reason"],
+}
+
+SPATIAL_LAYOUT_JSON_SCHEMA: dict[str, Any] = {
+ "title": "ImageSpatialLayoutOutput",
+ "type": "object",
+ "additionalProperties": False,
+ "properties": {
+ "anchor": {
+ "type": "object",
+ "additionalProperties": False,
+ "properties": {
+ "asset_id": {"type": "string", "minLength": 1},
+ "grid": {
+ "type": "string",
+ "enum": GRID_VALUE_LIST,
+ },
+ "reason": {"type": "string"},
+ },
+ "required": ["asset_id", "grid", "reason"],
+ },
+ "x_order": {
+ "type": "array",
+ "description": "Asset-id groups ordered from left to right.",
+ "items": {
+ "type": "array",
+ "items": {"type": "string", "minLength": 1},
+ "minItems": 1,
+ },
+ "minItems": 1,
+ },
+ "y_order": {
+ "type": "array",
+ "description": "Asset-id groups ordered from front to back.",
+ "items": {
+ "type": "array",
+ "items": {"type": "string", "minLength": 1},
+ "minItems": 1,
+ },
+ "minItems": 1,
+ },
+ "asset_states": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": True,
+ "properties": {
+ "asset_id": {"type": "string", "minLength": 1},
+ "is_arbitrary_layout": {"type": "boolean"},
+ "reason": {
+ "type": "string",
+ "minLength": 1,
+ "description": "Optional short explanation for debugging.",
+ },
+ },
+ "required": [
+ "asset_id",
+ "is_arbitrary_layout",
+ "reason",
+ ],
+ },
+ },
+ },
+ "required": ["anchor", "x_order", "y_order", "asset_states"],
+}
+
+# JSON schema for the spatial-layout verifier output: a pass/fail flag, a
+# non-empty reason, and a full layout. All three keys are required, so
+# "corrected_layout" must be present regardless of the value of "passed".
+SPATIAL_LAYOUT_VERIFIER_JSON_SCHEMA: dict[str, Any] = {
+    "title": "ImageSpatialLayoutVerifierOutput",
+    "type": "object",
+    "additionalProperties": False,
+    "properties": {
+        "passed": {
+            "type": "boolean",
+            "description": "Whether the draft spatial layout is correct.",
+        },
+        "reason": {
+            "type": "string",
+            "minLength": 1,
+            "description": "Concise verification reason.",
+        },
+        # Embedded by reference: this is the same dict object as
+        # SPATIAL_LAYOUT_JSON_SCHEMA (including its "title"), not a copy, so
+        # mutating one mutates the other.
+        "corrected_layout": SPATIAL_LAYOUT_JSON_SCHEMA,
+    },
+    "required": ["passed", "reason", "corrected_layout"],
+}
+
+
+TEXT_RELATIONS_JSON_SCHEMA: dict[str, Any] = {
+ "title": "TextRelationsOutput",
+ "type": "object",
+ "additionalProperties": False,
+ "properties": {
+ "object_relations": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": False,
+ "properties": {
+ "subject": {"type": "string", "minLength": 1},
+ "relation": {
+ "type": "string",
+ "enum": RELATION_VALUE_LIST,
+ },
+ "object": {"type": "string", "minLength": 1},
+ "evidence": {"type": "string", "minLength": 1},
+ },
+ "required": ["subject", "relation", "object", "evidence"],
+ },
+ },
+ "table_constraints": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": False,
+ "properties": {
+ "asset": {"type": "string", "minLength": 1},
+ "grid": {
+ "type": "string",
+ "enum": GRID_VALUE_LIST,
+ },
+ "evidence": {"type": "string", "minLength": 1},
+ },
+ "required": ["asset", "grid", "evidence"],
+ },
+ },
+ "object_layouts": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": False,
+ "properties": {
+ "asset": {"type": "string", "minLength": 1},
+ "is_arbitrary_layout": {"type": "boolean"},
+ "reason": {
+ "type": "string",
+ "minLength": 1,
+ "description": "Optional explanation for this unresolved item.",
+ },
+ },
+ "required": ["asset", "is_arbitrary_layout", "reason"],
+ },
+ },
+ },
+ "required": ["object_relations", "table_constraints", "object_layouts"],
+}
+
+
+# JSON schema for the up/down flip check output: the chosen candidate number
+# (1 or 2 only), a confidence in [0, 1], and a reason string. All keys are
+# required and unknown keys are rejected.
+UP_DOWN_FLIP_CHECK_JSON_SCHEMA: dict[str, Any] = {
+    "title": "AlignedUpDownFlipCheckOutput",
+    "type": "object",
+    "additionalProperties": False,
+    "properties": {
+        "selected_number": {"type": "integer", "enum": [1, 2]},
+        "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
+        "reason": {"type": "string"},
+    },
+    "required": ["selected_number", "confidence", "reason"],
+}
+
+# JSON schema for the metric-scale estimate output: a required "object_scales"
+# list whose entries carry an object id, exactly three bounding-box dimensions,
+# a confidence in [0, 1], and a reason string.
+IMAGE_METRIC_SCALE_JSON_SCHEMA: dict[str, Any] = {
+    "title": "ImageMetricScaleEstimate",
+    "type": "object",
+    "additionalProperties": False,
+    "properties": {
+        "object_scales": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {
+                    "object_id": {"type": "string"},
+                    "bbox_dims_cm": {
+                        "type": "array",
+                        "minItems": 3,
+                        "maxItems": 3,
+                        "items": {
+                            "type": "number",
+                            # A tiny positive minimum rules out zero and
+                            # negative dimensions.
+                            "minimum": 1.0e-6,
+                        },
+                    },
+                    "confidence": {
+                        "type": "number",
+                        "minimum": 0.0,
+                        "maximum": 1.0,
+                    },
+                    "reason": {"type": "string"},
+                },
+                "required": ["object_id", "bbox_dims_cm", "confidence", "reason"],
+            },
+        },
+    },
+    "required": ["object_scales"],
+}
+
+# JSON schema for the scene-edit intent output: ids to delete, new objects to
+# generate, the edit operations linking the two, unresolved queries, and an
+# overall reason. All five top-level keys are required and unknown keys are
+# rejected at every object level.
+SCENE_EDIT_INTENT_JSON_SCHEMA: dict[str, Any] = {
+    "title": "SceneEditIntentOutput",
+    "type": "object",
+    "additionalProperties": False,
+    "properties": {
+        "deleted_object_ids": {
+            "type": "array",
+            "description": (
+                "Existing scene object ids that should be removed. This includes "
+                "objects removed by delete operations and objects replaced by new "
+                "generated objects."
+            ),
+            "items": {"type": "string", "minLength": 1},
+        },
+        "generated_objects": {
+            "type": "array",
+            "description": (
+                "New objects that must be generated by the text-to-geometry "
+                "simready pipeline for add or replace operations."
+            ),
+            "items": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {
+                    "temp_id": {
+                        "type": "string",
+                        "minLength": 1,
+                        "description": (
+                            "Temporary id used by this edit plan, such as "
+                            "new_red_mug_0. It must not collide with existing ids."
+                        ),
+                    },
+                    "name": {
+                        "type": "string",
+                        "minLength": 1,
+                        "description": (
+                            "Canonical English snake_case object name for "
+                            "text-to-geometry."
+                        ),
+                    },
+                    "description": {
+                        "type": "string",
+                        "minLength": 20,
+                        "maxLength": 220,
+                        "description": (
+                            "Standalone appearance description used for "
+                            "text-to-geometry simready generation."
+                        ),
+                    },
+                    "source_operation": {
+                        "type": "string",
+                        "enum": ["add", "replace"],
+                    },
+                },
+                "required": [
+                    "temp_id",
+                    "name",
+                    "description",
+                    "source_operation",
+                ],
+            },
+        },
+        "operations": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "enum": ["delete", "replace", "add"],
+                    },
+                    # Both id fields are always required; per their
+                    # descriptions an empty string marks "not applicable", so
+                    # neither sets minLength.
+                    "target_object_id": {
+                        "type": "string",
+                        "description": (
+                            "Existing object id for delete/replace, or empty "
+                            "string for pure add."
+                        ),
+                    },
+                    "new_object_temp_id": {
+                        "type": "string",
+                        "description": (
+                            "Generated object temp_id for add/replace, or empty "
+                            "string for delete."
+                        ),
+                    },
+                    "placement": {
+                        "type": "object",
+                        "additionalProperties": False,
+                        "properties": {
+                            "type": {
+                                "type": "string",
+                                "enum": [
+                                    "preserve_target",
+                                    "random",
+                                    "relative_to_object",
+                                    "grid",
+                                ],
+                            },
+                            "reference_object_id": {
+                                "type": "string",
+                                "description": (
+                                    "Existing object id used as a spatial "
+                                    "reference, or empty string if unused."
+                                ),
+                            },
+                            # "" is an allowed value here and for "grid" below
+                            # (presumably meaning "unused" -- confirm against
+                            # the consumer of this schema).
+                            "relation": {
+                                "type": "string",
+                                "enum": [
+                                    "",
+                                    "left_of",
+                                    "right_of",
+                                    "front_of",
+                                    "back_of",
+                                ],
+                            },
+                            "grid": {
+                                "type": "string",
+                                "enum": [""] + GRID_VALUE_LIST,
+                            },
+                        },
+                        "required": [
+                            "type",
+                            "reference_object_id",
+                            "relation",
+                            "grid",
+                        ],
+                    },
+                    "reason": {"type": "string", "minLength": 1},
+                    "confidence": {
+                        "type": "number",
+                        "minimum": 0.0,
+                        "maximum": 1.0,
+                    },
+                },
+                "required": [
+                    "type",
+                    "target_object_id",
+                    "new_object_temp_id",
+                    "placement",
+                    "reason",
+                    "confidence",
+                ],
+            },
+        },
+        "unresolved": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": {
+                    "query": {"type": "string", "minLength": 1},
+                    "reason": {"type": "string", "minLength": 1},
+                    "candidate_object_ids": {
+                        "type": "array",
+                        "items": {"type": "string", "minLength": 1},
+                    },
+                },
+                "required": ["query", "reason", "candidate_object_ids"],
+            },
+        },
+        "reason": {
+            "type": "string",
+            "minLength": 1,
+            "description": "Brief overall explanation of the edit interpretation.",
+        },
+    },
+    "required": [
+        "deleted_object_ids",
+        "generated_objects",
+        "operations",
+        "unresolved",
+        "reason",
+    ],
+}
diff --git a/embodichain/gen_sim/prompt2scene/utils/__init__.py b/embodichain/gen_sim/prompt2scene/utils/__init__.py
new file mode 100644
index 000000000..8378c49ac
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/utils/__init__.py
@@ -0,0 +1,39 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from . import log
+from embodichain.gen_sim.prompt2scene.utils.io import (
+ image_to_data_url,
+ relative_path,
+ write_json,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import (
+ log_api_request_start,
+ log_info,
+ log_warning,
+)
+
+__all__ = [
+ "log",
+ "log_api_request_start",
+ "log_info",
+ "log_warning",
+ "image_to_data_url",
+ "relative_path",
+ "write_json",
+]
diff --git a/embodichain/gen_sim/prompt2scene/utils/io.py b/embodichain/gen_sim/prompt2scene/utils/io.py
new file mode 100644
index 000000000..6057d1981
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/utils/io.py
@@ -0,0 +1,66 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import base64
+import json
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.utils.log import log_info
+
+__all__ = ["image_to_data_url", "relative_path", "write_json"]
+
+
+def relative_path(path: str | Path, root: Path) -> str:
+    """Return ``path`` relative to ``root`` when it is contained by it.
+
+    Args:
+        path: Path to express relative to ``root``.
+        root: Base path to strip from ``path``.
+
+    Returns:
+        The relative path as a string, or ``str(path)`` unchanged when
+        ``Path.relative_to`` raises ``ValueError`` (``path`` is not under
+        ``root``).
+    """
+    # NOTE: despite the local name, nothing is resolved against the
+    # filesystem; the value is only wrapped in ``Path``.
+    resolved_path = Path(path)
+    try:
+        return str(resolved_path.relative_to(root))
+    except ValueError:
+        return str(path)
+
+
+def write_json(path: Path, payload: dict[str, Any]) -> None:
+    """Write a JSON payload with prompt2scene's default formatting.
+
+    The payload is serialized with a two-space indent, non-ASCII characters
+    left unescaped, and a trailing newline, then written as UTF-8. Missing
+    parent directories are created first.
+
+    Args:
+        path: Output JSON file path.
+        payload: JSON-serializable dictionary payload.
+
+    Raises:
+        FileNotFoundError: If ``path`` is not a regular file after writing.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(
+        json.dumps(payload, indent=2, ensure_ascii=False) + "\n",
+        encoding="utf-8",
+    )
+    # Post-write sanity check before the success message is logged.
+    if not path.is_file():
+        raise FileNotFoundError(f"JSON output was not written: {path}")
+    log_info(f"Wrote JSON: {path}")
+
+
+def image_to_data_url(image_path: Path) -> str:
+    """Return a base64 data URL for a local image file.
+
+    Args:
+        image_path: Image file to read. Its suffix, compared
+            case-insensitively, selects the MIME type.
+
+    Returns:
+        A string of the form ``data:<mime_type>;base64,<encoded bytes>``.
+    """
+    suffix_to_mime = {
+        ".jpg": "image/jpeg",
+        ".jpeg": "image/jpeg",
+        ".png": "image/png",
+        ".webp": "image/webp",
+        ".gif": "image/gif",
+    }
+    # Suffixes missing from the table fall back to "image/png".
+    mime_type = suffix_to_mime.get(image_path.suffix.lower(), "image/png")
+    # The whole file is read into memory and encoded in one step.
+    encoded = base64.b64encode(image_path.read_bytes()).decode("ascii")
+    return f"data:{mime_type};base64,{encoded}"
diff --git a/embodichain/gen_sim/prompt2scene/utils/log.py b/embodichain/gen_sim/prompt2scene/utils/log.py
new file mode 100644
index 000000000..47bdfa445
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/utils/log.py
@@ -0,0 +1,62 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+__all__ = ["log_api_request_start", "log_info", "log_warning"]
+
+# Configure the root logger as a side effect of importing this module.
+# ``logging.basicConfig`` does nothing when the root logger already has
+# handlers, so an application that configured logging earlier keeps its setup.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [EmbodiChain %(levelname)s]: %(message)s",
+    datefmt="%H:%M:%S",
+)
+
+# Module logger used by every helper below; pinned to INFO regardless of the
+# root logger's level.
+_LOGGER = logging.getLogger(__name__)
+_LOGGER.setLevel(logging.INFO)
+
+
+def _format_message(level: str, message: str) -> str:
+    """Return ``message`` prefixed with ``Prompt2Scene: ``.
+
+    Args:
+        level: Level name passed by the caller; accepted but not used in the
+            returned text.
+        message: Text to prefix.
+    """
+    # Explicitly discard the unused parameter.
+    _ = level
+    return f"Prompt2Scene: {message}"
+
+
+def log_info(message: str) -> None:
+    """Log an info message using the EmbodiChain log prefix.
+
+    Args:
+        message: Text to log; it is prefixed with ``Prompt2Scene: `` before
+            being emitted at INFO level on the module logger.
+    """
+    _LOGGER.info(_format_message("INFO", message))
+
+
+def log_warning(message: str) -> None:
+    """Log a warning message using the EmbodiChain log prefix.
+
+    Args:
+        message: Text to log; it is prefixed with ``Prompt2Scene: `` before
+            being emitted at WARNING level on the module logger.
+    """
+    _LOGGER.warning(_format_message("WARNING", message))
+
+
+def log_api_request_start(
+    *,
+    step: str,
+    request: str,
+    attempt: int | None = None,
+    **details: Any,
+) -> None:
+    """Log the start of an API request with a stable key order.
+
+    The logged text is ``api request start`` followed by space-separated
+    ``key=value`` fields in this order: ``step``, ``request``, ``attempt``
+    (only when it is not ``None``), then every entry of ``details`` in the
+    order it was passed.
+
+    Args:
+        step: Value logged as ``step=<value>``.
+        request: Value logged as ``request=<value>``.
+        attempt: Attempt number; left out of the message when ``None``.
+        **details: Extra fields appended as ``key=value``. Values are
+            formatted with an f-string and are not quoted or escaped.
+    """
+    fields = [f"step={step}", f"request={request}"]
+    if attempt is not None:
+        fields.append(f"attempt={attempt}")
+    for key, value in details.items():
+        fields.append(f"{key}={value}")
+    log_info("api request start " + " ".join(fields))
diff --git a/embodichain/gen_sim/prompt2scene/workflows/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/__init__.py
new file mode 100644
index 000000000..b7bf52f2d
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/__init__.py
@@ -0,0 +1,41 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+ DEBUG_DIRNAME,
+ IMAGE_SEGMENTS_STEP,
+ IMAGE_SPATIAL_RELATIONS_STEP,
+ RAW_MODEL_OUTPUT_FILENAME,
+ SCENE_EDIT_STEP,
+ SCENE_INTAKE_STEP,
+ STEP_RESULT_FILENAME,
+ UNIFIED_SCENE_STEP,
+ WorkflowArtifactWriter,
+)
+
+__all__ = [
+ "DEBUG_DIRNAME",
+ "IMAGE_SEGMENTS_STEP",
+ "IMAGE_SPATIAL_RELATIONS_STEP",
+ "RAW_MODEL_OUTPUT_FILENAME",
+ "SCENE_EDIT_STEP",
+ "SCENE_INTAKE_STEP",
+ "STEP_RESULT_FILENAME",
+ "UNIFIED_SCENE_STEP",
+ "WorkflowArtifactWriter",
+]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/artifact_writer.py b/embodichain/gen_sim/prompt2scene/workflows/artifact_writer.py
new file mode 100644
index 000000000..70f9ee66e
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/artifact_writer.py
@@ -0,0 +1,191 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+from embodichain.gen_sim.prompt2scene.workflows.paths import (
+ DEBUG_DIRNAME,
+ IMAGE_SEGMENTS_STEP,
+ IMAGE_SPATIAL_RELATIONS_STEP,
+ RAW_MODEL_OUTPUT_FILENAME,
+ SCENE_EDIT_STEP,
+ SCENE_INTAKE_STEP,
+ STEP_RESULT_FILENAME,
+ UNIFIED_SCENE_GEN_STEP,
+ UNIFIED_SCENE_STEP,
+ debug_dir_path,
+ debug_round_dir_path,
+ next_debug_round_dir_path,
+ next_debug_round_name,
+ step_dir_path,
+ step_result_path,
+)
+
+__all__ = [
+ "DEBUG_DIRNAME",
+ "IMAGE_SEGMENTS_STEP",
+ "IMAGE_SPATIAL_RELATIONS_STEP",
+ "RAW_MODEL_OUTPUT_FILENAME",
+ "SCENE_EDIT_STEP",
+ "SCENE_INTAKE_STEP",
+ "STEP_RESULT_FILENAME",
+ "UNIFIED_SCENE_GEN_STEP",
+ "UNIFIED_SCENE_STEP",
+ "WorkflowArtifactWriter",
+ "write_debug_json",
+ "write_debug_round_json",
+ "write_next_raw_model_output",
+ "write_raw_model_output",
+ "write_step_result",
+]
+
+
def write_step_result(
    output_root: Path,
    step_name: str,
    payload: dict[str, Any],
) -> Path:
    """Write a step's result payload as JSON and return the file location.

    The destination is whatever ``step_result_path(output_root, step_name)``
    resolves to.
    """
    destination = step_result_path(output_root, step_name)
    write_json(destination, payload)
    return destination
+
+
def write_debug_json(
    output_root: Path,
    step_name: str,
    round_name: str,
    filename: str,
    payload: dict[str, Any],
) -> Path:
    """Write *payload* as JSON into one debug round of a step.

    The round directory is looked up with ``debug_round_dir_path`` and
    *filename* is appended to it. Returns the path that was written.
    """
    round_dir = debug_round_dir_path(output_root, step_name, round_name)
    target = round_dir / filename
    write_json(target, payload)
    return target
+
+
def write_debug_round_json(
    debug_round_dir: Path,
    filename: str,
    payload: dict[str, Any],
) -> Path:
    """Write *payload* as JSON to ``debug_round_dir / filename``.

    Unlike ``write_debug_json`` the round directory is supplied by the
    caller instead of being derived from an output root and step name.
    Returns the path that was written.
    """
    target = debug_round_dir / filename
    write_json(target, payload)
    return target
+
+
def write_raw_model_output(
    output_root: Path,
    step_name: str,
    round_name: str,
    payload: dict[str, Any],
) -> Path:
    """Write *payload* under the raw-model-output filename of a debug round.

    Thin wrapper around ``write_debug_json`` with the filename fixed to
    ``RAW_MODEL_OUTPUT_FILENAME``. Returns the path that was written.
    """
    written = write_debug_json(
        output_root,
        step_name,
        round_name,
        filename=RAW_MODEL_OUTPUT_FILENAME,
        payload=payload,
    )
    return written
+
+
def write_next_raw_model_output(
    output_root: Path,
    step_name: str,
    payload: dict[str, Any],
    label: str | None = None,
) -> Path:
    """Write *payload* as the raw model output of the next debug round.

    The round name is obtained from ``next_debug_round_name`` (optionally
    influenced by *label*) and the write itself is delegated to
    ``write_raw_model_output``. Returns the path that was written.
    """
    return write_raw_model_output(
        output_root,
        step_name,
        next_debug_round_name(output_root, step_name, label),
        payload,
    )
+
+
class WorkflowArtifactWriter:
    """Artifact helper bound to one ``(output_root, step_name)`` pair.

    Every property and method forwards to the module-level path or write
    helper of the same purpose, filling in the bound output root and step
    name so callers do not have to repeat them.
    """

    def __init__(self, output_root: Path, step_name: str) -> None:
        self._output_root = output_root
        self._step_name = step_name

    @property
    def _scope(self) -> tuple[Path, str]:
        """Bound ``(output_root, step_name)`` pair, ready for star-unpacking."""
        return self._output_root, self._step_name

    @property
    def output_root(self) -> Path:
        """Output root passed to the constructor."""
        return self._output_root

    @property
    def step_name(self) -> str:
        """Step name passed to the constructor."""
        return self._step_name

    @property
    def step_dir(self) -> Path:
        """Directory of this step, as computed by ``step_dir_path``."""
        return step_dir_path(*self._scope)

    @property
    def debug_dir(self) -> Path:
        """Debug directory of this step, as computed by ``debug_dir_path``."""
        return debug_dir_path(*self._scope)

    @property
    def result_path(self) -> Path:
        """Result file of this step, as computed by ``step_result_path``."""
        return step_result_path(*self._scope)

    def next_debug_round_name(self, label: str | None = None) -> str:
        """Return the name of the next debug round for this step."""
        return next_debug_round_name(*self._scope, label)

    def next_debug_round_dir(self, label: str | None = None) -> Path:
        """Return the directory of the next debug round for this step."""
        return next_debug_round_dir_path(*self._scope, label)

    def debug_round_dir(self, round_name: str) -> Path:
        """Return the directory of the debug round called *round_name*."""
        return debug_round_dir_path(*self._scope, round_name)

    def write_step_result(self, payload: dict[str, Any]) -> Path:
        """Write *payload* as this step's result and return its path."""
        return write_step_result(*self._scope, payload)

    def write_debug_round_json(
        self,
        *,
        round_name: str,
        filename: str,
        payload: dict[str, Any],
    ) -> Path:
        """Write *payload* as ``filename`` inside the given debug round."""
        round_dir = self.debug_round_dir(round_name)
        return write_debug_round_json(round_dir, filename=filename, payload=payload)

    def write_raw_model_output(
        self,
        *,
        round_name: str,
        payload: dict[str, Any],
    ) -> Path:
        """Write *payload* as the raw model output of the given debug round."""
        return write_raw_model_output(*self._scope, round_name, payload)

    def write_next_raw_model_output(
        self,
        *,
        payload: dict[str, Any],
        label: str | None = None,
    ) -> Path:
        """Write *payload* as the raw model output of the next debug round."""
        return write_next_raw_model_output(*self._scope, payload, label=label)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/attempt_state.py b/embodichain/gen_sim/prompt2scene/workflows/attempt_state.py
new file mode 100644
index 000000000..15407e78e
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/attempt_state.py
@@ -0,0 +1,30 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import TypedDict
+
+__all__ = ["AttemptState"]
+
+
class AttemptState(TypedDict):
    """Common retry/error fields for one model-call stage.

    This is a plain ``TypedDict``: it only declares the expected keys and
    value types; nothing in this module enforces them at runtime.
    """

    # NOTE(review): presumably the number of attempts made so far — confirm
    # against the nodes that update this state.
    attempt_count: int
    # NOTE(review): presumably the retry budget for the stage — confirm.
    max_attempts: int
    # Most recent error message, or ``None``.
    last_error: str | None
    # NOTE(review): presumably all error messages collected across attempts
    # — confirm.
    errors: list[str]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/gym_export.py b/embodichain/gen_sim/prompt2scene/workflows/gym_export.py
new file mode 100644
index 000000000..974313069
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/gym_export.py
@@ -0,0 +1,762 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+import math
+import shutil
+import time
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.workflows.paths import (
+ UNIFIED_SCENE_GEN_STEP,
+ PipelinePaths,
+)
+
+__all__ = ["export_gym_config"]
+
# Physics attributes copied (via ``dict(...)``) into the ``attrs`` entry of
# every exported dynamic rigid object.
# NOTE(review): units are not stated in this file — presumably SI (kg, m,
# m/s); confirm against the consumer of gym_config.json.
_DEFAULT_OBJECT_ATTRS: dict[str, Any] = {
    "mass": 0.01,
    "contact_offset": 0.003,
    "rest_offset": 0.001,
    "restitution": 0.01,
    "max_depenetration_velocity": 10.0,
    "min_position_iters": 32,
    "min_velocity_iters": 8,
}

# Physics attributes copied into the ``attrs`` entry of the exported table
# (written as a kinematic background body).
_DEFAULT_TABLE_ATTRS: dict[str, Any] = {
    "mass": 10.0,
    "static_friction": 0.95,
    "dynamic_friction": 0.9,
    "restitution": 0.01,
}

# Written as ``max_convex_hull_num`` on every exported rigid object.
_DEFAULT_OBJECT_MAX_CONVEX_HULL_NUM = 16
# NOTE(review): not referenced anywhere in the visible part of this module —
# confirm whether the table entry is meant to carry it.
_DEFAULT_TABLE_MAX_CONVEX_HULL_NUM = 8
+
+
+def _resolve_path(value: str, output_root: Path) -> Path:
+ path = Path(value).expanduser()
+ if path.is_absolute():
+ return path.resolve()
+ return (output_root / path).resolve()
+
+
+def _read_json(path: Path) -> dict[str, Any]:
+ if path.is_dir():
+ raise IsADirectoryError(f"Expected JSON file but got directory: {path}")
+ if not path.is_file():
+ raise FileNotFoundError(f"JSON file not found: {path}")
+ with path.open("r", encoding="utf-8") as f:
+ data = json.load(f)
+ if not isinstance(data, dict):
+ raise ValueError(f"Expected JSON object at {path}")
+ return data
+
+
def _resolve_table_fit_manifest_path(
    *,
    manifest_path_value: Any,
    output_root: Path,
    paths: PipelinePaths,
) -> Path:
    """Locate the table-fit manifest JSON file.

    The recorded *manifest_path_value* is resolved against *output_root*
    and used when it points at a file. Otherwise ``paths.table_fit_manifest``
    is used when that is a file.

    Raises:
        FileNotFoundError: If *manifest_path_value* is falsy, or if neither
            candidate is a file and the recorded path is not a directory.
        IsADirectoryError: If neither candidate is a file and the recorded
            path resolves to a directory.
    """
    if not manifest_path_value:
        raise FileNotFoundError("table_fit_to_clutter manifest_path is missing or empty")

    recorded = _resolve_path(str(manifest_path_value), output_root)
    if recorded.is_file():
        return recorded

    fallback = paths.table_fit_manifest
    if fallback.is_file():
        return fallback

    detail = f"value={manifest_path_value!r} resolved={recorded}"
    if recorded.is_dir():
        raise IsADirectoryError(
            "table_fit_to_clutter manifest_path points to a directory, not a JSON "
            "file: " + detail
        )
    raise FileNotFoundError(
        "table_fit_to_clutter manifest_path does not point to a JSON file: " + detail
    )
+
+
+def _matrix_to_euler_xyz_deg(matrix: list[list[float]]) -> list[float]:
+ """Decompose a 3×3 or 4×4 rotation matrix into XYZ Euler angles (degrees)."""
+ m = np.asarray(matrix, dtype=np.float64)
+ r = m[:3, :3]
+ sy = math.sqrt(float(r[0, 0]) ** 2 + float(r[1, 0]) ** 2)
+ if sy > 1e-6:
+ x = math.atan2(float(r[2, 1]), float(r[2, 2]))
+ y = math.atan2(-float(r[2, 0]), sy)
+ z = math.atan2(float(r[1, 0]), float(r[0, 0]))
+ else:
+ x = math.atan2(-float(r[1, 2]), float(r[1, 1]))
+ y = math.atan2(-float(r[2, 0]), sy)
+ z = 0.0
+ return [math.degrees(x), math.degrees(y), math.degrees(z)]
+
+
+def _glb_to_sim_rotation() -> np.ndarray:
+ """Return the loader basis conversion from GLB Y-up to sim Z-up."""
+ return np.array(
+ [
+ [1.0, 0.0, 0.0],
+ [0.0, 0.0, -1.0],
+ [0.0, 1.0, 0.0],
+ ],
+ dtype=np.float64,
+ )
+
+
def _glb_rotation_to_sim(rotation_matrix: list[list[float]]) -> list[list[float]]:
    """Convert a GLB-space local rotation into simulation-space rotation.

    A 4×4 input is reduced to its upper-left 3×3 block. The result is the
    similarity transform ``B @ R @ B.T`` with ``B`` the GLB-to-sim basis,
    returned as nested lists.
    """
    glb_rot = np.asarray(rotation_matrix, dtype=np.float64)
    if glb_rot.shape == (4, 4):
        glb_rot = glb_rot[:3, :3]
    basis = _glb_to_sim_rotation()
    sim_rot = basis @ glb_rot @ basis.T
    return sim_rot.tolist()
+
+
+def _glb_scale_to_sim(scale: Sequence[float]) -> list[float]:
+ """Convert GLB-axis scale components to sim-axis body_scale components."""
+ values = [float(v) for v in scale]
+ if len(values) != 3:
+ raise ValueError("scale must have three components")
+ return [values[0], values[2], values[1]]
+
+
def _decompose_affine_matrix(
    matrix_value: Any,
) -> tuple[list[float], list[float], list[float]]:
    """Split a 4x4 affine matrix into translation, Euler rotation and scale.

    Scale is the Euclidean norm of each column of the linear 3x3 part. The
    rotation is that linear part with every column divided by its norm;
    columns whose norm is at most ``1e-12`` keep the identity column
    instead. The rotation is reported as XYZ Euler angles in degrees.

    Returns:
        ``(translation, euler_xyz_deg, scale)`` as plain lists.

    Raises:
        ValueError: If *matrix_value* is not 4x4.
    """
    affine = np.asarray(matrix_value, dtype=np.float64)
    if affine.shape != (4, 4):
        raise ValueError("Expected a 4x4 affine matrix.")

    linear = affine[:3, :3]
    column_norms = np.linalg.norm(linear, axis=0)

    # Start from identity so degenerate (near-zero) columns stay well-defined.
    rotation = np.eye(3, dtype=np.float64)
    usable = column_norms > 1.0e-12
    rotation[:, usable] = linear[:, usable] / column_norms[usable]

    translation = affine[:3, 3].tolist()
    euler_deg = _matrix_to_euler_xyz_deg(rotation.tolist())
    return translation, euler_deg, column_norms.tolist()
+
+
def _glb_max_z(glb_path: Path) -> float:
    """Maximum height (Y in GLB, Z in simulation) of a mesh.

    The file is loaded as a scene and flattened into one mesh; the value
    returned is the upper bound of that mesh along its Y axis.
    """
    import trimesh

    loaded = trimesh.load(glb_path, force="scene")
    if isinstance(loaded, trimesh.Trimesh):
        mesh = loaded
    else:
        flattened = loaded.dump(concatenate=True)
        if isinstance(flattened, trimesh.Trimesh):
            mesh = flattened
        else:
            mesh = trimesh.util.concatenate(
                [part for part in flattened if isinstance(part, trimesh.Trimesh)]
            )
    bounds = np.asarray(mesh.bounds, dtype=np.float64)
    upper_corner = bounds[1]
    return float(upper_corner[1])  # max Y
+
+
def _rotated_aabb_offsets(
    glb_path: Path,
    rotation_matrix: list[list[float]] | None,
    scale: float | Sequence[float] = 1.0,
) -> tuple[float, float, float]:
    """Compute the AABB shift caused by rotation + scale alone.

    The GLB at *glb_path* is loaded, its vertices are multiplied by *scale*
    (a scalar, or a per-axis 3-vector) and then rotated by
    *rotation_matrix* (3x3, or the upper-left block of a 4x4) about the
    local origin. No translation is applied.

    Returns:
        ``(x, y, z)`` expressed in simulation axes: the AABB centre along
        GLB X, the negated AABB centre along GLB Z, and the AABB minimum
        along GLB Y. The exporter subtracts these from the fitted AABB
        bottom-centre to obtain ``init_pos``.

    Raises:
        ValueError: If *scale* is a sequence that is not a 3-vector.
    """
    import trimesh

    loaded = trimesh.load(glb_path, force="scene")
    if isinstance(loaded, trimesh.Trimesh):
        mesh = loaded
    else:
        flattened = loaded.dump(concatenate=True)
        if isinstance(flattened, trimesh.Trimesh):
            mesh = flattened
        else:
            mesh = trimesh.util.concatenate(
                [part for part in flattened if isinstance(part, trimesh.Trimesh)]
            )

    points = mesh.vertices.copy()
    per_axis = isinstance(scale, Sequence) and not isinstance(scale, (str, bytes))
    if per_axis:
        factors = np.asarray(list(scale), dtype=np.float64)
        if factors.shape != (3,):
            raise ValueError("scale must be a scalar or a 3-vector")
        points *= factors
    else:
        points *= float(scale)

    if rotation_matrix is not None:
        rot = np.asarray(rotation_matrix, dtype=np.float64)
        if rot.shape == (4, 4):
            rot = rot[:3, :3]
        points = (rot @ points.T).T

    bounds = np.zeros((2, 3), dtype=np.float64)
    bounds[0] = points.min(axis=0)
    bounds[1] = points.max(axis=0)
    lower, upper = bounds[0], bounds[1]

    offset_x = float(0.5 * (lower[0] + upper[0]))  # AABB centre X → sim X
    offset_y = float(-0.5 * (lower[2] + upper[2]))  # -centre Z → sim Y
    offset_z = float(lower[1])  # min Y → sim Z
    return (offset_x, offset_y, offset_z)
+
+
def _sim_world_xy_aabb(
    glb_path: Path,
    rotation_matrix: list[list[float]] | None,
    scale: float | Sequence[float],
    init_pos: Sequence[float],
) -> dict[str, Any]:
    """Project a transformed simready GLB onto the simulation XY plane.

    Vertices are scaled, optionally rotated, mapped to simulation XY as
    ``(x, -z)`` and shifted by the first two components of *init_pos*.

    Returns:
        A dict with ``unit`` (``"m"``), ``center_xy``, ``aabb_xy``
        (``[min_xy, max_xy]``) and ``size_xy`` of the projected footprint.

    Raises:
        ValueError: If *scale* is a sequence that is not a 3-vector, or
            *init_pos* does not have three components.
    """
    import trimesh

    loaded = trimesh.load(glb_path, force="scene")
    if isinstance(loaded, trimesh.Trimesh):
        mesh = loaded
    else:
        flattened = loaded.dump(concatenate=True)
        if isinstance(flattened, trimesh.Trimesh):
            mesh = flattened
        else:
            mesh = trimesh.util.concatenate(
                [part for part in flattened if isinstance(part, trimesh.Trimesh)]
            )

    points = np.asarray(mesh.vertices.copy(), dtype=np.float64)
    per_axis = isinstance(scale, Sequence) and not isinstance(scale, (str, bytes))
    if per_axis:
        factors = np.asarray(list(scale), dtype=np.float64)
        if factors.shape != (3,):
            raise ValueError("scale must be a scalar or a 3-vector")
        points *= factors
    else:
        points *= float(scale)

    if rotation_matrix is not None:
        rot = np.asarray(rotation_matrix, dtype=np.float64)
        if rot.shape == (4, 4):
            rot = rot[:3, :3]
        points = (rot @ points.T).T

    origin = np.asarray(list(init_pos), dtype=np.float64)
    if origin.shape != (3,):
        raise ValueError("init_pos must have three components")

    sim_x = points[:, 0] + origin[0]
    sim_y = -points[:, 2] + origin[1]
    projected = np.column_stack((sim_x, sim_y))
    lower = projected.min(axis=0)
    upper = projected.max(axis=0)
    extent = np.maximum(upper - lower, 0.0)
    midpoint = 0.5 * (lower + upper)
    return {
        "unit": "m",
        "center_xy": midpoint.tolist(),
        "aabb_xy": [lower.tolist(), upper.tolist()],
        "size_xy": extent.tolist(),
    }
+
+
+def _support_region_2d(table_fit_manifest: dict[str, Any]) -> dict[str, Any]:
+ support = table_fit_manifest.get("final_support_quad_centered") or {}
+ corners = np.asarray(support.get("corners_xy", []), dtype=np.float64)
+ if corners.shape != (4, 2):
+ return {"unit": "m", "center_xy": [], "aabb_xy": [], "size_xy": [], "corners_xy": []}
+ min_xy = corners.min(axis=0)
+ max_xy = corners.max(axis=0)
+ center_xy = np.asarray(
+ support.get("center_xy") or (0.5 * (min_xy + max_xy)).tolist(),
+ dtype=np.float64,
+ )
+ size_xy = np.asarray(
+ support.get("size_xy") or (max_xy - min_xy).tolist(),
+ dtype=np.float64,
+ )
+ return {
+ "unit": "m",
+ "center_xy": center_xy.tolist(),
+ "aabb_xy": [min_xy.tolist(), max_xy.tolist()],
+ "size_xy": size_xy.tolist(),
+ "corners_xy": corners.tolist(),
+ }
+
+
def _write_scene_state(
    *,
    export_dir: Path,
    config_path: Path,
    table_desc: str,
    table_support_region_2d: dict[str, Any],
    object_states: list[dict[str, Any]],
    source_snapshots: dict[str, str],
) -> Path:
    """Write ``scene_state/result.json`` plus its top-down plot.

    The JSON records the coordinate frame, the table description and
    support region, the per-object states and the copied source snapshots.
    Paths stored in it are relative to *export_dir*. The JSON is written
    first, then ``scene_state/topdown_2d.png`` is rendered.

    Returns:
        Path of the written ``result.json``.
    """
    scene_state_dir = export_dir / "scene_state"
    scene_state_dir.mkdir(parents=True, exist_ok=True)
    state_path = scene_state_dir / "result.json"
    plot_path = scene_state_dir / "topdown_2d.png"

    coordinate_frame = {
        "unit": "m",
        "plane": "simulation_xy",
        "x_axis": "simulation +X",
        "y_axis": "simulation +Y",
        "note": "2D values are top-down projections onto the simulation XY plane.",
    }
    table_entry = {
        "id": "table",
        "role": "background",
        "description": table_desc,
        "support_region_2d": table_support_region_2d,
    }
    state = {
        "version": 1,
        "coordinate_frame": coordinate_frame,
        "gym_config_path": str(config_path.relative_to(export_dir)),
        "topdown_2d_plot_path": str(plot_path.relative_to(export_dir)),
        "source_snapshots": source_snapshots,
        "table": table_entry,
        "objects": object_states,
    }

    serialized = json.dumps(state, indent=2, ensure_ascii=False)
    state_path.write_text(serialized + "\n", encoding="utf-8")

    _render_scene_state_topdown(
        support_region=table_support_region_2d,
        objects=object_states,
        output_path=plot_path,
    )
    return state_path
+
+
def _copy_scene_source_snapshots(
    *,
    paths: PipelinePaths,
    export_dir: Path,
    scene_state_dir: Path,
) -> dict[str, str]:
    """Copy the upstream step results next to the exported scene state.

    Copies ``paths.unified_scene_result`` and the unified-scene-gen step
    result into *scene_state_dir* as ``<name>.json``. Sources that are not
    existing files are silently skipped.

    Returns:
        Mapping of snapshot name to the copied file's path relative to
        *export_dir*, containing only the snapshots that were copied.
    """
    scene_state_dir.mkdir(parents=True, exist_ok=True)
    candidates = (
        ("unified_scene", paths.unified_scene_result),
        ("unified_scene_gen", paths.step_result(UNIFIED_SCENE_GEN_STEP)),
    )
    copied: dict[str, str] = {}
    for snapshot_name, source_file in candidates:
        if source_file.is_file():
            target = scene_state_dir / f"{snapshot_name}.json"
            shutil.copy2(source_file, target)
            copied[snapshot_name] = str(target.relative_to(export_dir))
    return copied
+
+
def _render_scene_state_topdown(
    *,
    support_region: dict[str, Any],
    objects: list[dict[str, Any]],
    output_path: Path,
) -> None:
    """Render the top-down 2D scene state to a PNG at *output_path*.

    Draws the table support quad (when ``corners_xy`` is 4x2) and one
    rectangle per object whose ``footprint_2d`` has a 2x2 ``aabb_xy`` and a
    2-component ``center_xy``; other objects are skipped. Axis limits are
    fitted to everything drawn, with padding, falling back to a unit square
    around the origin when nothing could be drawn.

    The figure is always closed, even when drawing or saving fails, and a
    legend is only added when at least one labelled artist exists (so an
    absent support quad no longer triggers matplotlib's empty-legend
    warning).

    Args:
        support_region: Table support region; only ``corners_xy`` is read.
        objects: Object state dicts; ``id`` and ``footprint_2d`` are read.
        output_path: Destination image file; parent directories are created.
    """
    import matplotlib

    # Headless backend: rendering must work without a display.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    from matplotlib.patches import Polygon, Rectangle

    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig, ax = plt.subplots(figsize=(9, 9))
    try:
        # Points that must end up inside the axis limits.
        data_points: list[np.ndarray] = []
        corners = np.asarray(support_region.get("corners_xy", []), dtype=np.float64)
        if corners.shape == (4, 2):
            ax.add_patch(
                Polygon(
                    corners,
                    closed=True,
                    facecolor=(0.18, 0.62, 0.32, 0.14),
                    edgecolor=(0.05, 0.38, 0.16, 1.0),
                    linewidth=2.0,
                    label="table support region",
                )
            )
            data_points.append(corners)

        for obj in objects:
            footprint = obj.get("footprint_2d") or {}
            aabb = np.asarray(footprint.get("aabb_xy", []), dtype=np.float64)
            center = np.asarray(footprint.get("center_xy", []), dtype=np.float64)
            if aabb.shape != (2, 2) or center.shape != (2,):
                continue
            size = np.maximum(aabb[1] - aabb[0], 0.0)
            ax.add_patch(
                Rectangle(
                    aabb[0],
                    size[0],
                    size[1],
                    facecolor=(0.25, 0.48, 0.95, 0.22),
                    edgecolor=(0.08, 0.20, 0.65, 1.0),
                    linewidth=1.5,
                )
            )
            ax.plot(center[0], center[1], "o", color="#102a7a", markersize=4)
            label = str(obj.get("id", "")).replace("interact_", "")
            ax.text(
                center[0],
                center[1],
                f"{label}\n({center[0]:.3f}, {center[1]:.3f})",
                ha="center",
                va="center",
                fontsize=8,
                color="black",
            )
            data_points.append(aabb)

        if data_points:
            all_points = np.vstack(data_points)
            data_min = all_points.min(axis=0)
            data_max = all_points.max(axis=0)
        else:
            data_min = np.array([-0.5, -0.5], dtype=np.float64)
            data_max = np.array([0.5, 0.5], dtype=np.float64)
        # Guard against a zero span so the padding below stays positive.
        span = np.maximum(data_max - data_min, 1.0e-3)
        padding = max(float(span.max()) * 0.18, 0.05)
        x_limits = (float(data_min[0] - padding), float(data_max[0] + padding))
        y_limits = (float(data_min[1] - padding), float(data_max[1] + padding))

        ax.axhline(0.0, color="#303030", linewidth=1.2, alpha=0.75)
        ax.axvline(0.0, color="#303030", linewidth=1.2, alpha=0.75)
        ax.annotate(
            "+X",
            xy=(x_limits[1], 0.0),
            xytext=(x_limits[1] - 0.08 * (x_limits[1] - x_limits[0]), 0.02),
            arrowprops={"arrowstyle": "->", "color": "#303030", "lw": 1.4},
            color="#303030",
        )
        ax.annotate(
            "+Y",
            xy=(0.0, y_limits[1]),
            xytext=(0.02, y_limits[1] - 0.08 * (y_limits[1] - y_limits[0])),
            arrowprops={"arrowstyle": "->", "color": "#303030", "lw": 1.4},
            color="#303030",
        )
        ax.set_xlim(*x_limits)
        ax.set_ylim(*y_limits)
        ax.set_aspect("equal", adjustable="box")
        ax.set_xlabel("X (m)")
        ax.set_ylabel("Y (m)")
        ax.set_title("Prompt2Scene Gym Export Top-Down 2D State")
        ax.grid(True, linestyle=":", linewidth=0.6, alpha=0.45)

        # Only the support polygon carries a label; without it there is
        # nothing to list and matplotlib would warn about an empty legend.
        legend_handles, _ = ax.get_legend_handles_labels()
        if legend_handles:
            ax.legend(loc="upper right")

        fig.tight_layout()
        fig.savefig(output_path, dpi=180, facecolor="white")
    finally:
        # pyplot keeps figures alive until closed; release it on every path.
        plt.close(fig)
+
+
def _build_object_manifest(
    output_root: Path,
    step_result: dict[str, Any],
    table_fit_manifest: dict[str, Any],
    aligned_by_id: dict[str, dict[str, Any]],
) -> dict[str, Any]:
    """Merge world_bc, rotation, scale into one per-object record.

    Args:
        output_root: Base directory used to resolve relative GLB paths.
        step_result: Step result dict; its ``objects`` list is read.
        table_fit_manifest: Table-fit manifest; ``world_aabb_bottom_center``
            is read from each entry of its ``objects`` list.
        aligned_by_id: Alignment records keyed by object id; their
            ``rotation_matrix`` and ``scale`` are read.

    Returns:
        A dict keyed by object id, each value containing everything needed
        to compute ``init_pos`` / ``init_rot`` / ``body_scale``. Objects
        whose simready GLB (or ``mesh_path`` fallback) is not an existing
        file are left out and reported in a printed warning.
    """
    # Entries that are not dicts carry no usable fields; ignore them the
    # same way the table-fit loop below does instead of crashing on .get().
    objects_info = [
        obj for obj in (step_result.get("objects") or []) if isinstance(obj, dict)
    ]

    # index metric_scale by object id
    metric_by_id: dict[str, float] = {}
    for obj in objects_info:
        oid = str(obj.get("id", ""))
        if not oid:
            continue
        ms = obj.get("metric_scale")
        metric_by_id[oid] = (
            float(ms.get("scale_factor", 1.0)) if isinstance(ms, dict) else 1.0
        )

    # index world_aabb_bottom_center from table-fit manifest
    world_bc_by_id: dict[str, list[float]] = {}
    for entry in table_fit_manifest.get("objects") or []:
        if not isinstance(entry, dict):
            continue
        eid = str(entry.get("id", ""))
        wbc = entry.get("world_aabb_bottom_center")
        if eid and isinstance(wbc, list) and len(wbc) == 3:
            world_bc_by_id[eid] = [float(v) for v in wbc]

    consolidated: dict[str, Any] = {}
    skipped_no_glb: list[str] = []
    for obj in objects_info:
        oid = str(obj.get("id", ""))
        if not oid:
            continue

        source = obj.get("simready_geometry_path") or obj.get("mesh_path")
        simready_path = _resolve_path(source or "", output_root)
        if not simready_path.is_file():
            skipped_no_glb.append(oid)
            continue

        description = str(obj.get("description") or obj.get("name") or "").strip()

        rot_matrix: list[list[float]] | None = None
        transform_scale: list[float] | None = None
        aligned = aligned_by_id.get(oid)
        if aligned:
            raw_rotation = aligned.get("rotation_matrix")
            if raw_rotation and isinstance(raw_rotation, list):
                rot_matrix = raw_rotation
            raw_scale = aligned.get("scale")
            if isinstance(raw_scale, list) and len(raw_scale) == 3:
                transform_scale = [float(v) for v in raw_scale]

        consolidated[oid] = {
            "id": oid,
            "description": description,
            "simready_path": simready_path,
            "scale_factor": metric_by_id.get(oid, 1.0),
            "transform_scale": transform_scale,
            "rotation_matrix": rot_matrix,
            "world_aabb_bottom_center": world_bc_by_id.get(oid),
        }

    if skipped_no_glb:
        print(
            " [WARN] object(s) skipped (simready GLB not found): "
            + ", ".join(skipped_no_glb)
        )
    # Compare against every id present in step_result, not just the exported
    # ones: an object skipped for a missing GLB *is* in step_result and is
    # already reported by the warning above.
    extra_in_manifest = set(world_bc_by_id) - set(metric_by_id)
    if extra_in_manifest:
        print(
            " [WARN] object(s) in table-fit manifest but not in step_result: "
            + ", ".join(sorted(extra_in_manifest))
        )

    return consolidated
+
+
def export_gym_config(
    output_root: Path,
    *,
    export_dir: Path | None = None,
) -> Path:
    """Export the unified-scene-gen result as a gym_config.json bundle.

    Uses **simready** GLBs — transforms are written explicitly as
    ``body_scale``, ``init_pos``, and ``init_rot``.

    Besides ``gym_config.json`` the bundle contains the copied meshes under
    ``mesh_assets/`` and a ``scene_state/`` directory with a JSON summary,
    a top-down plot and snapshots of the upstream step results.

    Args:
        output_root: Pipeline output root holding the step results.
        export_dir: Destination directory; defaults to
            ``<output_root>/gym_export``.

    Returns:
        Path of the written ``gym_config.json``.

    Raises:
        RuntimeError: If ``table_fit_to_clutter`` did not finish with
            status ``"ok"``.
        FileNotFoundError: If the table GLB (or a required JSON file) is
            missing.
        ValueError: If an object has no ``world_aabb_bottom_center`` in the
            table-fit manifest.
    """
    output_root = output_root.expanduser().resolve()
    if export_dir is None:
        export_dir = output_root / "gym_export"
    else:
        export_dir = export_dir.expanduser().resolve()
    export_dir.mkdir(parents=True, exist_ok=True)

    paths = PipelinePaths(output_root)

    step_result = _read_json(paths.step_result(UNIFIED_SCENE_GEN_STEP))
    table_fit = step_result.get("table_fit_to_clutter") or {}
    if table_fit.get("status") != "ok":
        raise RuntimeError(
            "Cannot export gym_config because table_fit_to_clutter did not "
            f"succeed: status={table_fit.get('status')!r} "
            f"reason={table_fit.get('reason', '')}"
        )
    manifest_path_value = table_fit.get("manifest_path") or ""
    table_fit_manifest = _read_json(
        _resolve_table_fit_manifest_path(
            manifest_path_value=manifest_path_value,
            output_root=output_root,
            paths=paths,
        )
    )

    aligned_by_id: dict[str, dict[str, Any]] = {}
    if paths.simready_to_aligned_manifest.is_file():
        aligned_items = _read_json(paths.simready_to_aligned_manifest).get("items", [])
        for item in aligned_items or []:
            if isinstance(item, dict) and item.get("id"):
                aligned_by_id[str(item["id"])] = item

    object_manifest = _build_object_manifest(
        output_root, step_result, table_fit_manifest, aligned_by_id
    )

    table_info = step_result.get("table") or {}
    # ``or ""`` keeps an explicit JSON null from becoming the text "None".
    table_desc = str(
        table_info.get("complete_table_description")
        or table_info.get("description")
        or ""
    ).strip()

    mesh_assets_dir = export_dir / "mesh_assets"
    mesh_assets_dir.mkdir(parents=True, exist_ok=True)

    # ``or ""`` keeps an explicit JSON null from reaching Path(), so a
    # missing table mesh always surfaces as the FileNotFoundError below.
    table_simready = _resolve_path(
        table_info.get("simready_geometry_path")
        or table_info.get("mesh_path")
        or "",
        output_root,
    )
    if not table_simready.is_file():
        raise FileNotFoundError(f"Table simready GLB not found: {table_simready}")
    table_dst = mesh_assets_dir / "table" / "table_0.glb"
    table_dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(table_simready, table_dst)

    table_fit_transform = table_fit_manifest.get("table_fit_transform")
    if table_fit_transform:
        table_init_pos, table_init_rot, table_body_scale = _decompose_affine_matrix(
            table_fit_transform
        )
    else:
        # No full transform recorded: fall back to the uniform XY scale.
        uniform_scale = 1.0
        ts = table_fit_manifest.get("table_xy_scale")
        if isinstance(ts, dict):
            uniform_scale = float(ts.get("uniform_scale", 1.0))
        table_init_pos = [0.0, 0.0, 0.0]
        table_init_rot = [0.0, 0.0, 0.0]
        table_body_scale = [uniform_scale, uniform_scale, 1.0]

    rigid_objects: list[dict[str, Any]] = []
    object_states: list[dict[str, Any]] = []

    total = len(object_manifest)
    for position, (oid, om) in enumerate(object_manifest.items(), start=1):
        # Fail before copying the mesh or loading it: without the fitted
        # bottom-centre no pose can be computed for this object.
        wbc = om["world_aabb_bottom_center"]
        if wbc is None:
            raise ValueError(f"Missing table-fit world_aabb_bottom_center for {oid}")

        safe_name = oid.replace("interact_", "").strip("_") or "object"
        obj_dir = mesh_assets_dir / safe_name / oid
        obj_dir.mkdir(parents=True, exist_ok=True)
        object_dst = obj_dir / f"{oid}.glb"
        shutil.copy2(om["simready_path"], object_dst)

        # Per-axis alignment scale wins over the uniform metric scale.
        sf = om["scale_factor"]
        scale_glb = om.get("transform_scale") or [sf, sf, sf]
        body_scale = _glb_scale_to_sim(scale_glb)

        init_rot: list[float] = [0.0, 0.0, 0.0]
        if om["rotation_matrix"] is not None:
            init_rot = _matrix_to_euler_xyz_deg(
                _glb_rotation_to_sim(om["rotation_matrix"])
            )

        # Remove the AABB shift introduced by scale + rotation so that
        # init_pos refers to the mesh's local origin.
        ro = _rotated_aabb_offsets(
            om["simready_path"], om["rotation_matrix"], scale_glb
        )
        init_pos = [wbc[0] - ro[0], wbc[1] - ro[1], wbc[2] - ro[2]]

        rigid_objects.append(
            {
                "uid": oid,
                "description": om["description"],
                "shape": {
                    "shape_type": "Mesh",
                    "fpath": str(object_dst.relative_to(export_dir)),
                    "compute_uv": False,
                },
                "attrs": dict(_DEFAULT_OBJECT_ATTRS),
                "body_type": "dynamic",
                "init_pos": init_pos,
                "init_rot": init_rot,
                "body_scale": body_scale,
                "max_convex_hull_num": _DEFAULT_OBJECT_MAX_CONVEX_HULL_NUM,
            }
        )
        footprint_2d = _sim_world_xy_aabb(
            om["simready_path"],
            om["rotation_matrix"],
            scale_glb,
            init_pos,
        )
        object_states.append(
            {
                "id": oid,
                "name": safe_name,
                "role": "interact",
                "description": om["description"],
                "init_pos": init_pos,
                "init_rot": init_rot,
                "body_scale": body_scale,
                "footprint_2d": footprint_2d,
            }
        )
        # The position always comes from the table-fit bottom-centre here;
        # the missing case raised above.
        print(
            f" [{position}/{total}] [{oid}] {om['description']}"
            f" pos={init_pos} rot={init_rot} scale={body_scale} src=wbc"
        )

    config = {
        "id": f"Prompt2Scene-{int(time.time() * 1000)}-v0",
        "max_episodes": 10,
        "max_episode_steps": 300,
        "env": {"events": {}, "observations": {}, "dataset": {}},
        "robot": {},
        "sensor": [],
        "light": {},
        "background": [
            {
                "uid": "table",
                "description": table_desc,
                "shape": {
                    "shape_type": "Mesh",
                    "fpath": str(table_dst.relative_to(export_dir)),
                    "compute_uv": False,
                },
                "attrs": dict(_DEFAULT_TABLE_ATTRS),
                "body_scale": table_body_scale,
                "body_type": "kinematic",
                "init_pos": table_init_pos,
                "init_rot": table_init_rot,
            }
        ],
        "rigid_object": rigid_objects,
    }

    config_path = export_dir / "gym_config.json"
    config_path.write_text(
        json.dumps(config, indent=4, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    scene_state_dir = export_dir / "scene_state"
    source_snapshots = _copy_scene_source_snapshots(
        paths=paths,
        export_dir=export_dir,
        scene_state_dir=scene_state_dir,
    )
    scene_state_path = _write_scene_state(
        export_dir=export_dir,
        config_path=config_path,
        table_desc=table_desc,
        table_support_region_2d=_support_region_2d(table_fit_manifest),
        object_states=object_states,
        source_snapshots=source_snapshots,
    )
    print(f" scene_state={scene_state_path.relative_to(export_dir)}")

    return config_path
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/__init__.py
new file mode 100644
index 000000000..ab49ab724
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/__init__.py
@@ -0,0 +1,24 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.graph import (
+ build_image_relations_graph,
+ run_image_relations,
+)
+
+__all__ = ["build_image_relations_graph", "run_image_relations"]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/graph.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/graph.py
new file mode 100644
index 000000000..ff67f3a03
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/graph.py
@@ -0,0 +1,189 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from langgraph.graph import END, StateGraph
+
+from embodichain.gen_sim.prompt2scene.llms import (
+ OpenAICompatibleLLMCfg,
+ build_chat_model,
+)
+from embodichain.gen_sim.prompt2scene.utils import log
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+ format_result_missing_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+ ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.nodes import (
+ call_vlm_filter_initial_segments_node,
+ call_vlm_spatial_layout_node,
+ normalize_asset_segments_node,
+ prepare_segmentation_input_node,
+ retry_missing_by_candidates_node,
+ segment_table_node,
+ segment_by_name_node,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+ SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.state import (
+ ImageRelationsState,
+)
+
+__all__ = ["build_image_relations_graph", "run_image_relations"]
+
+
def route_after_filter_extra_instances(state: ImageRelationsState) -> str:
    """Choose the edge to follow once the initial VLM segment filter has run.

    Args:
        state: Workflow state; ``last_error`` is always read, while
            ``attempt_count`` and ``max_attempts`` are read only when an
            error is recorded.

    Returns:
        ``"retry"`` when the last call recorded an error and attempts remain,
        otherwise ``"continue"`` (clean run, or retries exhausted).
    """
    has_error = state["last_error"] is not None
    # Short-circuit keeps the attempt counters untouched on a clean run.
    should_retry = has_error and state["attempt_count"] < state["max_attempts"]
    return "retry" if should_retry else "continue"
+
+
def route_after_spatial_layout(state: ImageRelationsState) -> str:
    """Choose the edge to follow once spatial-layout extraction has run.

    Args:
        state: Workflow state; ``last_error`` is always read, while
            ``attempt_count`` and ``max_attempts`` are read only when an
            error is recorded.

    Returns:
        ``"retry"`` when the last call recorded an error and attempts remain,
        otherwise ``"end"`` (clean run, or retries exhausted).
    """
    has_error = state["last_error"] is not None
    # Short-circuit keeps the attempt counters untouched on a clean run.
    should_retry = has_error and state["attempt_count"] < state["max_attempts"]
    return "retry" if should_retry else "end"
+
+
def build_image_relations_graph(llm: Any) -> Any:
    """Assemble and compile the fixed image-relations LangGraph workflow.

    The pipeline runs name-based segmentation, a VLM filter (retryable),
    candidate-based recovery of missing instances, normalization, table
    segmentation and finally spatial-layout extraction (retryable).

    Args:
        llm: Model handle passed to every node that takes an ``llm`` keyword.

    Returns:
        The object returned by ``StateGraph.compile()``.
    """
    workflow = StateGraph(ImageRelationsState)

    # --- nodes -----------------------------------------------------------
    workflow.add_node("prepare_segmentation_input", prepare_segmentation_input_node)
    workflow.add_node("segment_by_name", segment_by_name_node)
    workflow.add_node(
        "call_vlm_filter_initial_segments",
        lambda state: call_vlm_filter_initial_segments_node(state, llm=llm),
    )
    workflow.add_node(
        "retry_missing_by_candidates",
        lambda state: retry_missing_by_candidates_node(state, llm=llm),
    )
    workflow.add_node("normalize_asset_segments", normalize_asset_segments_node)
    workflow.add_node(
        "segment_table",
        lambda state: segment_table_node(state, llm=llm),
    )
    workflow.add_node(
        "call_vlm_spatial_layout",
        lambda state: call_vlm_spatial_layout_node(state, llm=llm),
    )

    # --- unconditional edges ---------------------------------------------
    workflow.set_entry_point("prepare_segmentation_input")
    straight_edges = (
        ("prepare_segmentation_input", "segment_by_name"),
        ("segment_by_name", "call_vlm_filter_initial_segments"),
        ("retry_missing_by_candidates", "normalize_asset_segments"),
        ("normalize_asset_segments", "segment_table"),
        ("segment_table", "call_vlm_spatial_layout"),
    )
    for source_node, target_node in straight_edges:
        workflow.add_edge(source_node, target_node)

    # --- retry loops -----------------------------------------------------
    # Each model-backed stage loops back onto itself while its router
    # answers "retry".
    workflow.add_conditional_edges(
        "call_vlm_filter_initial_segments",
        route_after_filter_extra_instances,
        {
            "retry": "call_vlm_filter_initial_segments",
            "continue": "retry_missing_by_candidates",
        },
    )
    workflow.add_conditional_edges(
        "call_vlm_spatial_layout",
        route_after_spatial_layout,
        {
            "retry": "call_vlm_spatial_layout",
            "end": END,
        },
    )
    return workflow.compile()
+
+
def run_image_relations(
    request: Prompt2SceneInput,
    *,
    scene_intake: SceneIntakeSpec,
    llm_cfg: OpenAICompatibleLLMCfg,
    output_root: Path,
) -> ImageRelationSpec:
    """Run the image-relations workflow for one prompt2scene request.

    Args:
        request: The prompt2scene request placed into the graph state.
        scene_intake: Scene-intake result placed into the graph state.
        llm_cfg: Model configuration; also supplies ``max_attempts``.
        output_root: Root directory placed into the graph state.

    Returns:
        The resulting ``ImageRelationSpec`` when its status is ``"ok"`` and
        an anchor is present.

    Raises:
        RuntimeError: When the workflow produced no spec, a spec whose status
            is not ``"ok"``, or an ``"ok"`` spec without an anchor. The
            message is logged as a warning before raising.
    """
    llm = build_chat_model(llm_cfg)
    workflow = build_image_relations_graph(llm)
    initial_state = {
        "request": request,
        "scene_intake": scene_intake,
        "output_root": output_root,
        "segment_groups": [],
        "raw_model_output": None,
        "image_relations": None,
        "attempt_count": 0,
        "max_attempts": llm_cfg.max_attempts,
        "last_error": None,
        "errors": [],
    }
    result = workflow.invoke(initial_state)

    spec = result.get("image_relations")
    if spec is None:
        # The workflow never produced a spec at all.
        message = format_result_missing_error(
            "Image relations",
            "ImageRelationSpec",
            attempt_count=result.get("attempt_count", 0),
            last_error=result.get("last_error"),
            errors=result.get("errors", []),
        )
    elif spec.status != "ok":
        # Segmentation alignment failed: report every group that is not ok,
        # including the table group when it failed.
        failed_manifests = [
            group.to_manifest() for group in spec.groups if group.status != "ok"
        ]
        table_group = spec.table_group
        if table_group is not None and table_group.status != "ok":
            failed_manifests.append(table_group.to_manifest())
        message = (
            "Image relations failed to align all image segments. "
            f"Failed groups: {failed_manifests}"
        )
    elif spec.anchor is None:
        # Segmentation is fine but no spatial layout was attached.
        message = format_result_missing_error(
            "Image relations",
            "spatial layout",
            attempt_count=result.get("attempt_count", 0),
            last_error=result.get("last_error"),
            errors=result.get("errors", []),
        )
    else:
        return spec

    log.log_warning(message)
    raise RuntimeError(message)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/nodes.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/nodes.py
new file mode 100644
index 000000000..2c36868c2
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/nodes.py
@@ -0,0 +1,492 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.image_segment_filter import (
+ filter_segments_with_vlm,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+ ImageAssetSegment,
+ ImageRelationGroup,
+ ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.request import InputKind
+from embodichain.gen_sim.prompt2scene.prompts.schemas import (
+ SPATIAL_LAYOUT_JSON_SCHEMA,
+)
+from embodichain.gen_sim.prompt2scene.utils import (
+ log_api_request_start,
+ log,
+)
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+ IMAGE_SEGMENTS_STEP,
+ IMAGE_SPATIAL_RELATIONS_STEP,
+ WorkflowArtifactWriter,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.utils import (
+ append_unique,
+ apply_spatial_layout_output,
+ asset_bbox_label,
+ draw_labeled_bboxes,
+ expand_asset_ids,
+ filter_group_segments_with_artifacts,
+ merge_non_overlapping_segments,
+ prompt_text,
+ path_token,
+ require_image_path,
+ segment_area,
+ segment_prompt,
+ segments_from_response,
+ select_largest_table_segment,
+ table_segmentation_prompts,
+ verify_spatial_layout_output,
+ write_table_candidate_debug_image,
+)
+from embodichain.gen_sim.prompt2scene.prompts.builders import (
+ build_spatial_layout_messages,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.state import (
+ ImageRelationsState,
+)
+from embodichain.gen_sim.prompt2scene.llms.llm_output import (
+ call_structured_json_model_step,
+ is_model_output_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+ format_attempt_error,
+)
+
+__all__ = [
+ "call_vlm_filter_extra_instances_node",
+ "call_vlm_filter_initial_segments_node",
+ "call_vlm_spatial_layout_node",
+ "normalize_asset_segments_node",
+ "prepare_segmentation_input_node",
+ "retry_missing_by_candidates_node",
+ "segment_table_node",
+ "segment_by_name_node",
+]
+
+
def prepare_segmentation_input_node(state: ImageRelationsState) -> dict[str, object]:
    """Seed one pending segmentation group per scene-intake asset.

    Args:
        state: Workflow state; ``request`` and ``scene_intake`` are read.

    Returns:
        A state update with ``segment_groups`` holding one dict per asset.

    Raises:
        ValueError: If the request is not an image request or has no image
            path.
    """
    req = state["request"]
    if req.input_kind != InputKind.IMAGE or req.image_path is None:
        raise ValueError("Image relations requires an image input.")

    def _pending_group(asset: Any) -> dict[str, Any]:
        # Every group starts empty; later nodes fill segments and prompts.
        return {
            "name": asset.name,
            "description": asset.description,
            "asset_ids": expand_asset_ids(asset.id, asset.count),
            "class_candidate": list(asset.class_candidate),
            "segments": [],
            "tried_prompts": [],
            "debug_images": [],
            "status": "pending",
            "error": None,
            "expected_count": asset.count,
        }

    return {
        "segment_groups": [
            _pending_group(asset) for asset in state["scene_intake"].assets
        ]
    }
+
+
def segment_by_name_node(state: ImageRelationsState) -> dict[str, object]:
    """Segment the image once per group, prompting with the group's name.

    Args:
        state: Workflow state; ``segment_groups`` is read and not mutated
            (each group is shallow-copied before being updated).

    Returns:
        A state update with the refreshed ``segment_groups`` list.
    """
    source_image = require_image_path(state)

    def _segment_one(original: dict[str, Any]) -> dict[str, Any]:
        name_prompt = prompt_text(original["name"])
        response = segment_prompt(image_path=source_image, prompt=name_prompt)
        updated = dict(original)
        updated["tried_prompts"] = append_unique(
            updated["tried_prompts"], name_prompt
        )
        # The group handed over already lists the prompt that was just tried.
        updated["segments"] = segments_from_response(
            group=updated,
            response=response,
            source_prompt=name_prompt,
        )
        return updated

    return {
        "segment_groups": [_segment_one(group) for group in state["segment_groups"]]
    }
+
+
def call_vlm_filter_extra_instances_node(
    state: ImageRelationsState,
    *,
    llm: Any,
) -> dict[str, object]:
    """Alias kept for compatibility; delegates to the initial segment filter.

    Args:
        state: Workflow state, forwarded unchanged.
        llm: Model handle, forwarded unchanged.

    Returns:
        The state update produced by
        ``call_vlm_filter_initial_segments_node``.
    """
    state_update = call_vlm_filter_initial_segments_node(state, llm=llm)
    return state_update
+
+
def call_vlm_filter_initial_segments_node(
    state: ImageRelationsState,
    *,
    llm: Any,
) -> dict[str, object]:
    """Run the VLM filter over the masks from the name-based segmentation pass.

    Args:
        state: Workflow state; ``output_root``, ``segment_groups``,
            ``attempt_count`` and ``errors`` are read.
        llm: Model handle forwarded to ``filter_segments_with_vlm``.

    Returns:
        Whatever ``filter_segments_with_vlm`` returns, used as the state
        update.
    """
    source_image = require_image_path(state)
    debug_writer = WorkflowArtifactWriter(state["output_root"], IMAGE_SEGMENTS_STEP)
    # The writer's bound methods serve as the filter's debug-output hooks.
    state_update = filter_segments_with_vlm(
        llm=llm,
        image_path=source_image,
        step_name=IMAGE_SEGMENTS_STEP,
        segment_groups=state["segment_groups"],
        attempt_count=state["attempt_count"],
        errors=state["errors"],
        stage="initial",
        next_debug_round_name=debug_writer.next_debug_round_name,
        debug_round_dir=debug_writer.debug_round_dir,
        write_debug_json=debug_writer.write_debug_round_json,
    )
    return state_update


def _apply_fallback_prompt(
    *,
    llm: Any,
    image_path: Any,
    artifact_writer: Any,
    group: dict[str, Any],
    prompt: str,
    stage: str,
    segments: list[dict[str, Any]],
    confirmed_segments: list[dict[str, Any]],
    expected_count: int,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Segment with one fallback prompt, VLM-filter the result, and merge it.

    ``group["tried_prompts"]`` is updated in place on the passed dict so the
    prompt is never tried twice for the same group.

    Returns:
        ``(merged_segments, updated_confirmed_segments)``.
    """
    response = segment_prompt(image_path=image_path, prompt=prompt)
    group["tried_prompts"] = append_unique(group["tried_prompts"], prompt)
    new_segments = segments_from_response(
        group=group,
        response=response,
        source_prompt=prompt,
    )
    new_segments = filter_group_segments_with_artifacts(
        llm=llm,
        image_path=image_path,
        artifact_writer=artifact_writer,
        group=group,
        segments=new_segments,
        stage=stage,
        confirmed_segments=confirmed_segments,
    )
    merged = merge_non_overlapping_segments(
        existing=segments,
        incoming=new_segments,
        limit=expected_count,
    )
    # Every filtered segment counts as confirmed for later filter calls,
    # whether or not the merge kept it.
    return merged, confirmed_segments + new_segments


def retry_missing_by_candidates_node(
    state: ImageRelationsState,
    *,
    llm: Any,
) -> dict[str, object]:
    """Try alternative prompts for groups that are still missing instances.

    For each group, the class candidates after the first one are tried in
    order until the expected instance count is reached; if the group is still
    short, the asset description is tried once as a final prompt. Prompts
    already listed in ``tried_prompts`` are skipped.

    Args:
        state: Workflow state; ``output_root`` and ``segment_groups`` are
            read. Groups are shallow-copied before being updated.
        llm: Model handle forwarded to the per-group VLM filter.

    Returns:
        A state update with the refreshed ``segment_groups`` list.
    """
    image_path = require_image_path(state)
    artifact_writer = WorkflowArtifactWriter(state["output_root"], IMAGE_SEGMENTS_STEP)
    segment_groups = []
    for group in state["segment_groups"]:
        group = dict(group)
        segments = group["segments"]
        expected_count = group["expected_count"]
        # Segments of every group as recorded in the incoming state; rebuilt
        # per group, so additions made for earlier groups in this pass are
        # not carried over.
        confirmed_segments = [
            segment
            for existing_group in state["segment_groups"]
            for segment in existing_group.get("segments", [])
        ]

        for candidate_name in group["class_candidate"][1:]:
            if len(segments) >= expected_count:
                break
            prompt = prompt_text(candidate_name)
            if prompt in group["tried_prompts"]:
                continue
            segments, confirmed_segments = _apply_fallback_prompt(
                llm=llm,
                image_path=image_path,
                artifact_writer=artifact_writer,
                group=group,
                prompt=prompt,
                stage=f"fallback_{path_token(prompt)}",
                segments=segments,
                confirmed_segments=confirmed_segments,
                expected_count=expected_count,
            )

        if len(segments) < expected_count:
            # Last resort: use the free-text description as the prompt.
            description_prompt = str(group.get("description") or "").strip()
            if description_prompt and description_prompt not in group["tried_prompts"]:
                segments, confirmed_segments = _apply_fallback_prompt(
                    llm=llm,
                    image_path=image_path,
                    artifact_writer=artifact_writer,
                    group=group,
                    prompt=description_prompt,
                    stage="fallback_description",
                    segments=segments,
                    confirmed_segments=confirmed_segments,
                    expected_count=expected_count,
                )

        group["segments"] = segments
        segment_groups.append(group)
    return {"segment_groups": segment_groups}
+
+
def normalize_asset_segments_node(state: ImageRelationsState) -> dict[str, object]:
    """Pair each group's final segments with its scene-intake asset IDs.

    A group is marked failed when it has fewer (``"missing_segments"``) or
    more (``"extra_segments"``) segments than expected; a single failed group
    fails the whole spec. The labeled bounding-box image is drawn only when
    every group is ok. The segmentation manifest is written in either case.

    Args:
        state: Workflow state; ``segment_groups`` and ``output_root`` are
            read.

    Returns:
        A state update with the new ``image_relations`` spec.
    """
    source_image = require_image_path(state)
    aligned_segments: list[ImageAssetSegment] = []
    group_reports: list[ImageRelationGroup] = []
    every_group_ok = True

    for group in state["segment_groups"]:
        wanted = group["expected_count"]
        found = group["segments"]

        if len(found) < wanted:
            failure = "missing_segments"
        elif len(found) > wanted:
            failure = "extra_segments"
        else:
            failure = None
        group_ok = failure is None
        every_group_ok = every_group_ok and group_ok

        group_reports.append(
            ImageRelationGroup(
                name=group["name"],
                expected_count=wanted,
                detected_count=len(found),
                status="ok" if group_ok else "failed",
                tried_prompts=list(group["tried_prompts"]),
                asset_ids=list(group["asset_ids"]),
                debug_images=list(group["debug_images"]),
                error=failure,
            )
        )
        if not group_ok:
            # Failed groups contribute a report but no aligned segments.
            continue

        aligned_segments.extend(
            ImageAssetSegment(
                asset_id=asset_id,
                name=group["name"],
                segment_id=segment["segment_id"],
                bbox_xyxy=list(segment["bbox_xyxy"]),
                score=float(segment["score"]),
                source_prompt=segment["source_prompt"],
                mask_rle=segment.get("mask_rle"),
            )
            for asset_id, segment in zip(group["asset_ids"], found)
        )

    labeled_image_path = None
    if every_group_ok:
        bbox_writer = WorkflowArtifactWriter(
            state["output_root"],
            IMAGE_SEGMENTS_STEP,
        )
        labeled_boxes = [
            {
                "bbox_xyxy": aligned.bbox_xyxy,
                "label": asset_bbox_label(aligned.asset_id),
            }
            for aligned in aligned_segments
        ]
        labeled_image_path = str(
            draw_labeled_bboxes(
                image_path=source_image,
                boxes=labeled_boxes,
                output_path=bbox_writer.step_dir / "asset_segments_bbox_name.png",
            )
        )

    spec = ImageRelationSpec(
        status="ok" if every_group_ok else "failed",
        image_path=str(source_image),
        asset_segments=aligned_segments,
        groups=group_reports,
        bbox_name_image_path=labeled_image_path,
    )
    result_writer = WorkflowArtifactWriter(
        state["output_root"],
        IMAGE_SEGMENTS_STEP,
    )
    result_writer.write_step_result(spec.to_segmentation_manifest())
    return {"image_relations": spec}
+
+
def segment_table_node(
    state: ImageRelationsState,
    *,
    llm: Any,
) -> dict[str, object]:
    """Segment the table/support target once object segmentation succeeded.

    Prompts from ``table_segmentation_prompts`` are tried in order; the first
    prompt for which ``select_largest_table_segment`` yields a segment wins.
    A debug image is written for every prompt tried. The resulting spec
    copies the object-level results and adds the table segment and group; its
    status becomes ``"failed"`` when no table segment was found.

    Args:
        state: Workflow state; ``image_relations``, ``scene_intake`` and
            ``output_root`` are read.
        llm: Not used by this node; accepted so it can be bound like the
            other model-backed nodes.

    Returns:
        An empty update when there is no ``"ok"`` spec to extend, otherwise
        a state update with the extended ``image_relations`` spec.
    """
    image_relations = state["image_relations"]
    if image_relations is None or image_relations.status != "ok":
        return {}

    image_path = require_image_path(state)
    table = state["scene_intake"].table
    artifact_writer = WorkflowArtifactWriter(state["output_root"], IMAGE_SEGMENTS_STEP)
    group = {
        "name": table.name,
        "description": table.description,
        "asset_ids": [table.id],
        "class_candidate": list(table.class_candidate),
        "segments": [],
        "tried_prompts": [],
        "debug_images": [],
        "status": "pending",
        "error": None,
        "expected_count": 1,
    }
    segments: list[dict[str, Any]] = []

    for prompt in table_segmentation_prompts(group):
        if len(segments) >= 1:
            break
        response = segment_prompt(image_path=image_path, prompt=prompt)
        group["tried_prompts"] = append_unique(group["tried_prompts"], prompt)
        new_segments = segments_from_response(
            group=group,
            response=response,
            source_prompt=prompt,
        )
        write_table_candidate_debug_image(
            image_path=image_path,
            artifact_writer=artifact_writer,
            group=group,
            segments=new_segments,
            stage=f"table_{path_token(prompt)}",
        )
        selected_segment = select_largest_table_segment(new_segments)
        if selected_segment is not None:
            segments = [selected_segment]

    group_status = "ok" if len(segments) == 1 else "failed"
    error = None if group_status == "ok" else "missing_table_segment"
    table_group = ImageRelationGroup(
        name=group["name"],
        expected_count=1,
        detected_count=len(segments),
        status=group_status,
        tried_prompts=list(group["tried_prompts"]),
        asset_ids=[table.id],
        debug_images=list(group["debug_images"]),
        error=error,
    )
    table_segment = None
    if group_status == "ok":
        segment = segments[0]
        table_segment = ImageAssetSegment(
            asset_id=table.id,
            name=table.name,
            segment_id=segment["segment_id"],
            bbox_xyxy=list(segment["bbox_xyxy"]),
            score=float(segment["score"]),
            source_prompt=segment["source_prompt"],
            mask_rle=segment.get("mask_rle"),
        )

    # The spec is frozen, so build a new one carrying over every other field.
    updated_image_relations = ImageRelationSpec(
        status="ok" if group_status == "ok" else "failed",
        image_path=image_relations.image_path,
        asset_segments=image_relations.asset_segments,
        groups=image_relations.groups,
        table_segment=table_segment,
        table_group=table_group,
        bbox_name_image_path=image_relations.bbox_name_image_path,
        anchor=image_relations.anchor,
        x_order=image_relations.x_order,
        y_order=image_relations.y_order,
        asset_layouts=image_relations.asset_layouts,
    )
    artifact_writer.write_step_result(updated_image_relations.to_segmentation_manifest())
    return {"image_relations": updated_image_relations}
+
+
def call_vlm_spatial_layout_node(
    state: ImageRelationsState,
    *,
    llm: Any,
) -> dict[str, object]:
    """Query the model for the spatial layout and attach it to the spec.

    The raw layout is passed through ``verify_spatial_layout_output`` and the
    verifier's ``corrected_layout`` is what gets applied. The spatial
    manifest, with the verifier output attached, is written as the step
    result.

    Args:
        state: Workflow state; ``image_relations``, ``attempt_count``,
            ``output_root`` and ``errors`` are read.
        llm: Model handle used for the layout call and the verifier.

    Returns:
        An empty update when there is no ``"ok"`` spec; on success the
        incremented ``attempt_count``, the updated ``image_relations`` and a
        cleared ``last_error``; on a recoverable failure the incremented
        ``attempt_count``, ``last_error`` and the extended ``errors`` list.

    Raises:
        ValueError: If the spec has no ``bbox_name_image_path``.
        Exception: Any failure that is neither a model-output error nor a
            ``ValueError`` is re-raised unchanged.
    """
    spec = state["image_relations"]
    if spec is None or spec.status != "ok":
        return {}
    if spec.bbox_name_image_path is None:
        raise ValueError("Image spatial layout requires bbox_name_image_path.")

    attempt = state["attempt_count"] + 1
    asset_ids = [aligned.asset_id for aligned in spec.asset_segments]
    writer = WorkflowArtifactWriter(
        state["output_root"],
        IMAGE_SPATIAL_RELATIONS_STEP,
    )
    messages = build_spatial_layout_messages(
        bbox_name_image_path=Path(spec.bbox_name_image_path),
        asset_ids=asset_ids,
    )

    try:
        log_api_request_start(
            step=IMAGE_SPATIAL_RELATIONS_STEP,
            request="spatial_layout",
            attempt=attempt,
        )
        raw_layout = call_structured_json_model_step(
            llm=llm,
            schema=SPATIAL_LAYOUT_JSON_SCHEMA,
            messages=messages,
            context="Image spatial layout",
            attempt_count=attempt,
        )
        verifier_report = verify_spatial_layout_output(
            llm=llm,
            bbox_name_image_path=Path(spec.bbox_name_image_path),
            asset_ids=asset_ids,
            raw_model_output=raw_layout,
            attempt_count=attempt,
            artifact_writer=writer,
        )
        updated_spec = apply_spatial_layout_output(
            image_relations=spec,
            raw_model_output=verifier_report["corrected_layout"],
        )
        manifest = updated_spec.to_spatial_manifest()
        manifest["spatial_layout_verifier"] = verifier_report
        writer.write_step_result(manifest)
    except Exception as exc:
        # Only model-output problems and ValueErrors feed the retry loop;
        # anything else propagates to the caller.
        if not (is_model_output_error(exc) or isinstance(exc, ValueError)):
            raise
        error = format_attempt_error(
            "Image relations spatial layout", attempt, exc
        )
        log.log_warning(error)
        return {
            "attempt_count": attempt,
            "last_error": error,
            "errors": state["errors"] + [error],
        }

    return {
        "attempt_count": attempt,
        "image_relations": updated_spec,
        "last_error": None,
    }
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/schema.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/schema.py
new file mode 100644
index 000000000..91dc4583e
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/schema.py
@@ -0,0 +1,169 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+__all__ = [
+ "ImageAnchor",
+ "ImageAssetLayout",
+ "ImageAssetSegment",
+ "ImageRelationGroup",
+ "ImageRelationSpec",
+]
+
+
@dataclass(frozen=True)
class ImageAssetSegment:
    """One segmentation result matched to a single scene-intake asset."""

    asset_id: str
    name: str
    segment_id: str
    bbox_xyxy: list[float]
    score: float
    source_prompt: str
    mask_rle: dict[str, Any] | None = None

    def to_manifest(self) -> dict[str, Any]:
        """Return the segment as a plain dict.

        ``bbox_xyxy`` is copied into a new list; ``mask_rle`` is passed
        through as the same object.
        """
        return dict(
            asset_id=self.asset_id,
            name=self.name,
            segment_id=self.segment_id,
            bbox_xyxy=list(self.bbox_xyxy),
            score=self.score,
            source_prompt=self.source_prompt,
            mask_rle=self.mask_rle,
        )
+
+
@dataclass(frozen=True)
class ImageRelationGroup:
    """Alignment outcome for the assets that share one object name."""

    name: str
    expected_count: int
    detected_count: int
    status: str
    tried_prompts: list[str] = field(default_factory=list)
    asset_ids: list[str] = field(default_factory=list)
    debug_images: list[str] = field(default_factory=list)
    error: str | None = None

    def to_manifest(self) -> dict[str, Any]:
        """Return the group as a plain dict with copied list fields."""
        manifest: dict[str, Any] = {}
        manifest["name"] = self.name
        manifest["expected_count"] = self.expected_count
        manifest["detected_count"] = self.detected_count
        manifest["status"] = self.status
        # List fields are copied so the manifest does not share them.
        manifest["tried_prompts"] = list(self.tried_prompts)
        manifest["asset_ids"] = list(self.asset_ids)
        manifest["debug_images"] = list(self.debug_images)
        manifest["error"] = self.error
        return manifest
+
+
@dataclass(frozen=True)
class ImageAnchor:
    """Anchor asset and the table-grid value it is assigned to."""

    asset_id: str
    grid: str
    reason: str = ""

    def to_manifest(self) -> dict[str, Any]:
        """Return the anchor as a plain dict."""
        exported = ("asset_id", "grid", "reason")
        return {key: getattr(self, key) for key in exported}
+
+
@dataclass(frozen=True)
class ImageAssetLayout:
    """Layout state recorded for one image asset instance."""

    asset_id: str
    is_arbitrary_layout: bool
    reason: str = ""

    def to_manifest(self) -> dict[str, Any]:
        """Return the layout as a plain dict."""
        return dict(
            asset_id=self.asset_id,
            is_arbitrary_layout=self.is_arbitrary_layout,
            reason=self.reason,
        )
+
+
@dataclass(frozen=True)
class ImageRelationSpec:
    """Segmentation alignment result plus the spatial relations built on it."""

    status: str
    image_path: str
    asset_segments: list[ImageAssetSegment]
    groups: list[ImageRelationGroup]
    table_segment: ImageAssetSegment | None = None
    table_group: ImageRelationGroup | None = None
    bbox_name_image_path: str | None = None
    anchor: ImageAnchor | None = None
    x_order: list[list[str]] = field(default_factory=list)
    y_order: list[list[str]] = field(default_factory=list)
    asset_layouts: list[ImageAssetLayout] = field(default_factory=list)

    def to_manifest(self) -> dict[str, Any]:
        """Return the segmentation manifest overlaid with the spatial one.

        Keys present in both (``image_path``, ``bbox_name_image_path``) take
        the spatial manifest's value.
        """
        return {**self.to_segmentation_manifest(), **self.to_spatial_manifest()}

    def to_segmentation_manifest(self) -> dict[str, Any]:
        """Return only the segmentation alignment data as a plain dict.

        ``status`` is not part of the manifest.
        """
        segment_manifests = [item.to_manifest() for item in self.asset_segments]
        group_manifests = [item.to_manifest() for item in self.groups]
        table_segment_manifest = (
            self.table_segment.to_manifest() if self.table_segment else None
        )
        table_group_manifest = (
            self.table_group.to_manifest() if self.table_group else None
        )
        return {
            "image_path": self.image_path,
            "bbox_name_image_path": self.bbox_name_image_path,
            "asset_segments": segment_manifests,
            "groups": group_manifests,
            "table_segment": table_segment_manifest,
            "table_group": table_group_manifest,
        }

    def to_spatial_manifest(self) -> dict[str, Any]:
        """Return only the anchor, ordering and layout data as a plain dict."""
        anchor_manifest = self.anchor.to_manifest() if self.anchor else None
        # x_order is exported as left-to-right, y_order as front-to-back;
        # inner groups are copied into new lists.
        ordering = {
            "left_to_right": [list(tier) for tier in self.x_order],
            "front_to_back": [list(tier) for tier in self.y_order],
        }
        return {
            "image_path": self.image_path,
            "bbox_name_image_path": self.bbox_name_image_path,
            "anchor": anchor_manifest,
            "spatial_order": ordering,
            "objects": [item.to_manifest() for item in self.asset_layouts],
        }
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/state.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/state.py
new file mode 100644
index 000000000..598530058
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/state.py
@@ -0,0 +1,42 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+ ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+ SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.attempt_state import AttemptState
+
+__all__ = ["ImageRelationsState"]
+
+
class ImageRelationsState(AttemptState):
    """LangGraph state for image asset segmentation alignment.

    Extends ``AttemptState``; the retry bookkeeping keys used by this
    workflow (``attempt_count``, ``max_attempts``, ``last_error``,
    ``errors``) are presumably declared there — confirm against
    ``attempt_state``.
    """

    # The prompt2scene request being processed.
    request: Prompt2SceneInput
    # Scene-intake result for the request.
    scene_intake: SceneIntakeSpec
    # Root directory under which the workflow writes its artifacts.
    output_root: Path
    # One mutable dict per object group, carried between segmentation nodes.
    segment_groups: list[dict[str, Any]]
    # Raw structured model output, or None when none has been stored.
    raw_model_output: dict[str, Any] | None
    # Current image-relations result, or None before one has been produced.
    image_relations: ImageRelationSpec | None
diff --git a/embodichain/gen_sim/prompt2scene/workflows/image_relations/utils.py b/embodichain/gen_sim/prompt2scene/workflows/image_relations/utils.py
new file mode 100644
index 000000000..0644cea52
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/image_relations/utils.py
@@ -0,0 +1,464 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.agent_tools.clients.image_segmentation_client import (
+ ImageSegmentationClient,
+ ImageSegmentationError,
+ ImageSegmentationServerRequest,
+ ImageSegmentationServerResponse,
+ bbox_iou,
+ decode_rle_mask,
+ draw_labeled_bboxes,
+ draw_numbered_masks,
+ is_usable_segmentation_candidate,
+ sort_segments_by_bbox,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.image_segment_filter import (
+ filter_group_segments_with_vlm,
+)
+from embodichain.gen_sim.prompt2scene.prompts.schemas import (
+ SPATIAL_LAYOUT_JSON_SCHEMA,
+ SPATIAL_LAYOUT_VERIFIER_JSON_SCHEMA,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+ ImageAnchor,
+ ImageAssetLayout,
+ ImageAssetSegment,
+ ImageRelationGroup,
+ ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.spatial_relations import (
+ GRID_VALUES,
+ validate_exact_asset_id_coverage,
+)
+from embodichain.gen_sim.prompt2scene.utils import log_api_request_start, log
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+ IMAGE_SEGMENTS_STEP,
+ IMAGE_SPATIAL_RELATIONS_STEP,
+ WorkflowArtifactWriter,
+)
+from embodichain.gen_sim.prompt2scene.prompts.builders import (
+ build_spatial_layout_messages,
+ build_spatial_layout_verifier_messages,
+)
+from embodichain.gen_sim.prompt2scene.llms.llm_output import (
+ call_structured_json_model_step,
+)
+
+__all__ = [
+ "MAX_SEGMENT_RETRIES",
+ "OVERLAP_IOU_THRESHOLD",
+ "append_unique",
+ "apply_spatial_layout_output",
+ "asset_bbox_label",
+ "draw_labeled_bboxes",
+ "expand_asset_ids",
+ "filter_group_segments_with_artifacts",
+ "merge_non_overlapping_segments",
+ "parse_anchor",
+ "parse_asset_states",
+ "parse_order_groups",
+ "path_token",
+ "prompt_text",
+ "require_image_path",
+ "segment_area",
+ "segment_prompt",
+ "segments_from_response",
+ "select_largest_table_segment",
+ "sort_segments_by_bbox",
+ "table_segmentation_prompts",
+ "verify_spatial_layout_output",
+ "write_table_candidate_debug_image",
+]
+
+MAX_SEGMENT_RETRIES = 1  # passed as max_retries to ImageSegmentationClient.segment
+OVERLAP_IOU_THRESHOLD = 0.5  # merge keeps a segment only if its bbox IoU with every kept one is below this
+
+
+def require_image_path(state: dict[str, Any]) -> Path:
+    """Return ``state["request"].image_path``; raise ValueError when it is None."""
+    request = state["request"]
+    if request.image_path is not None:
+        return request.image_path
+    raise ValueError("Image relations requires request.image_path.")
+
+
+def prompt_text(name: str) -> str:
+    """Turn an asset name into prompt text by swapping each "_" for a space."""
+    return " ".join(name.split("_"))
+
+
+def asset_bbox_label(asset_id: str) -> str:
+    """Strip a leading ``interact_`` from an asset id to get its display label."""
+    marker = "interact_"
+    return asset_id if not asset_id.startswith(marker) else asset_id[len(marker) :]
+
+
+def expand_asset_ids(asset_id: str, count: int) -> list[str]:
+    """Return ``count`` instance ids: ``<asset_id>_0`` up to ``<asset_id>_<count-1>``."""
+    return [asset_id + "_" + str(number) for number in range(count)]
+
+
+def path_token(value: str) -> str:
+    """Make a filesystem-safe token (max 80 chars); "prompt" if nothing is left."""
+    cleaned = "".join(map(lambda char: char if char.isalnum() else "_", value))
+    return cleaned.strip("_")[:80] or "prompt"
+
+
+def append_unique(values: list[str], value: str) -> list[str]:
+    """Return ``values`` itself if it holds ``value``, else a new list ending with it."""
+    return values + [value] if value not in values else values
+
+
+def segment_prompt(
+    *,
+    image_path: Path,
+    prompt: str,
+) -> ImageSegmentationServerResponse:
+    """Segment ``image_path`` with one text prompt via ImageSegmentationClient.
+
+    Raises:
+        RuntimeError: If the client returns an ImageSegmentationError.
+    """
+    client = ImageSegmentationClient()
+    log_api_request_start(
+        step=IMAGE_SEGMENTS_STEP, request="sam3_segment", prompt=prompt
+    )
+    request = ImageSegmentationServerRequest(prompt=prompt, image_path=image_path)
+    response = client.segment(request, max_retries=MAX_SEGMENT_RETRIES)
+    if not isinstance(response, ImageSegmentationError):
+        return response
+    log.log_warning(response.error_message)
+    raise RuntimeError(response.error_message)
+
+
+def segments_from_response(
+    *,
+    group: dict[str, Any],
+    response: ImageSegmentationServerResponse,
+    source_prompt: str,
+) -> list[dict[str, Any]]:
+    """Turn the usable candidates of a response into internal segment dicts."""
+    # Ids are numbered in candidate order, i.e. before the final
+    # ``sort_segments_by_bbox`` call reorders the list.
+    usable = filter(is_usable_segmentation_candidate, response.result.candidates)
+    records = [
+        {
+            "segment_id": f"{group['name']}_{position}",
+            "bbox_xyxy": list(item.bbox_xyxy),
+            "score": float(item.score),
+            "mask_rle": item.mask_rle,
+            "source_prompt": source_prompt,
+        }
+        for position, item in enumerate(usable)
+    ]
+    return sort_segments_by_bbox(records)
+
+
+def apply_spatial_layout_output(
+    *,
+    image_relations: ImageRelationSpec,
+    raw_model_output: dict[str, Any],
+) -> ImageRelationSpec:
+    """Return a new spec carrying the parsed VLM spatial layout.
+
+    The ``anchor``, ``x_order``, ``y_order`` and ``asset_states`` entries of
+    ``raw_model_output`` are parsed and validated against the asset ids of
+    ``image_relations.asset_segments``; all other spec fields are copied over.
+
+    Raises:
+        ValueError: If a layout entry is malformed (raised by the parse helpers).
+    """
+    asset_ids = [segment.asset_id for segment in image_relations.asset_segments]
+    field = raw_model_output.get
+    anchor = parse_anchor(field("anchor"), asset_id_set=set(asset_ids))
+    orders = {
+        axis: parse_order_groups(field(axis), asset_ids=asset_ids, field_name=axis)
+        for axis in ("x_order", "y_order")
+    }
+    states = parse_asset_states(field("asset_states"), asset_ids=asset_ids)
+    layouts = []
+    for asset_id in asset_ids:
+        state = states[asset_id]
+        layouts.append(
+            ImageAssetLayout(
+                asset_id=asset_id,
+                is_arbitrary_layout=state["is_arbitrary_layout"],
+                reason=state["reason"],
+            )
+        )
+    return ImageRelationSpec(
+        status=image_relations.status,
+        image_path=image_relations.image_path,
+        asset_segments=image_relations.asset_segments,
+        groups=image_relations.groups,
+        table_segment=image_relations.table_segment,
+        table_group=image_relations.table_group,
+        bbox_name_image_path=image_relations.bbox_name_image_path,
+        anchor=anchor,
+        x_order=orders["x_order"],
+        y_order=orders["y_order"],
+        asset_layouts=layouts,
+    )
+
+
+def verify_spatial_layout_output(
+    *,
+    llm: Any,
+    bbox_name_image_path: Path,
+    asset_ids: list[str],
+    raw_model_output: dict[str, Any],
+    attempt_count: int,
+    artifact_writer: WorkflowArtifactWriter,
+) -> dict[str, Any]:
+    """Run the spatial-layout verifier model on a draft layout and return its output.
+
+    The draft is sent as pretty-printed JSON, and the verifier result is
+    written to a fresh debug round before ``corrected_layout`` is checked.
+
+    Raises ValueError if ``corrected_layout`` of the result is not a dict.
+    """
+    draft_json = json.dumps(raw_model_output, ensure_ascii=False, indent=2)
+    messages = build_spatial_layout_verifier_messages(
+        bbox_name_image_path=bbox_name_image_path,
+        asset_ids=asset_ids,
+        draft_spatial_layout_json=draft_json,
+    )
+    log_api_request_start(
+        step=IMAGE_SPATIAL_RELATIONS_STEP,
+        request="spatial_layout_verify",
+        attempt=attempt_count,
+    )
+    round_name = artifact_writer.next_debug_round_name("spatial_layout_verify")
+
+    def save(filename: str, payload: Any) -> Any:
+        return artifact_writer.write_debug_round_json(
+            round_name=round_name, filename=filename, payload=payload
+        )
+
+    verifier_output = call_structured_json_model_step(
+        llm=llm,
+        schema=SPATIAL_LAYOUT_VERIFIER_JSON_SCHEMA,
+        messages=messages,
+        context="Image spatial layout verifier",
+        attempt_count=attempt_count,
+        raw_output_writer=lambda raw: save("raw_model_output.json", raw),
+    )
+    save("verifier_result.json", verifier_output)
+    if isinstance(verifier_output.get("corrected_layout"), dict):
+        return verifier_output
+    raise ValueError("spatial_layout_verifier.corrected_layout must be an object.")
+
+
+def parse_anchor(raw_anchor: Any, *, asset_id_set: set[str]) -> ImageAnchor:
+    """Validate the ``anchor`` entry and return it as an ImageAnchor."""
+    if not isinstance(raw_anchor, dict):
+        raise ValueError("anchor must be an object.")
+    # Missing or falsy fields are read as empty strings before validation.
+    keys = ("asset_id", "grid", "reason")
+    asset_id, grid, reason = (str(raw_anchor.get(key) or "").strip() for key in keys)
+    if asset_id not in asset_id_set:
+        raise ValueError(f"anchor.asset_id is not a known asset: {asset_id!r}.")
+    if grid not in GRID_VALUES:
+        raise ValueError(f"anchor.grid is not valid: {grid!r}.")
+    return ImageAnchor(asset_id=asset_id, grid=grid, reason=reason)
+
+
+def parse_order_groups(
+    raw_order: Any,
+    *,
+    asset_ids: list[str],
+    field_name: str,
+) -> list[list[str]]:
+    """Parse ``raw_order`` as a non-empty list of non-empty asset-id groups.
+
+    Each id is stringified and stripped; the flattened ids are then checked
+    with ``validate_exact_asset_id_coverage`` against ``asset_ids``.
+
+    Raises:
+        ValueError: If ``raw_order`` or any group is not a non-empty list.
+    """
+    if not (isinstance(raw_order, list) and raw_order):
+        raise ValueError(f"{field_name} must be a non-empty list.")
+    groups: list[list[str]] = []
+    for group_index, members in enumerate(raw_order):
+        if not (isinstance(members, list) and members):
+            raise ValueError(f"{field_name}[{group_index}] must be a non-empty list.")
+        groups.append([str(member).strip() for member in members])
+
+    validate_exact_asset_id_coverage(
+        values=[asset_id for group in groups for asset_id in group],
+        expected_asset_ids=asset_ids,
+        context=field_name,
+    )
+    return groups
+
+
+def parse_asset_states(
+    raw_asset_states: Any,
+    *,
+    asset_ids: list[str],
+) -> dict[str, dict[str, Any]]:
+    """Parse ``asset_states`` into per-asset layout state keyed by asset id.
+
+    Raises:
+        ValueError: If the value is not a list, an entry is not a dict, its flag
+            is not a bool, its reason is empty, or an ``asset_id`` repeats.
+    """
+    if not isinstance(raw_asset_states, list):
+        raise ValueError("asset_states must be a list.")
+    parsed: dict[str, dict[str, Any]] = {}
+    for state_index, entry in enumerate(raw_asset_states):
+        if not isinstance(entry, dict):
+            raise ValueError(f"asset_states[{state_index}] must be an object.")
+        asset_id = str(entry.get("asset_id") or "").strip()
+        flag = entry.get("is_arbitrary_layout")
+        reason = str(entry.get("reason") or "").strip()
+        if not isinstance(flag, bool):
+            raise ValueError(
+                f"asset_states[{state_index}].is_arbitrary_layout must be boolean."
+            )
+        if not reason:
+            raise ValueError(f"asset_states[{state_index}].reason must be non-empty.")
+        if asset_id in parsed:
+            raise ValueError(f"asset_states has duplicate asset_id: {asset_id!r}.")
+        parsed[asset_id] = {"is_arbitrary_layout": flag, "reason": reason}
+    validate_exact_asset_id_coverage(
+        values=list(parsed),
+        expected_asset_ids=asset_ids,
+        context="asset_states",
+    )
+    return parsed
+
+
+def table_segmentation_prompts(group: dict[str, Any]) -> list[str]:
+    """Return de-duplicated, non-empty segmentation prompts for a table/support group.
+
+    Prompt order is: the group ``name``, then every ``class_candidate`` entry
+    after the first, then the stripped ``description`` if it is non-empty.
+    Names are converted with ``prompt_text``; the description is used as is.
+
+    Raises:
+        KeyError: If ``group`` has no ``name`` or ``class_candidate`` key.
+    """
+    ordered = [prompt_text(group["name"])]
+    ordered += [prompt_text(name) for name in group["class_candidate"][1:]]
+    ordered.append(str(group.get("description") or "").strip())
+    return list(dict.fromkeys(prompt for prompt in ordered if prompt))
+
+
+def write_table_candidate_debug_image(
+    *,
+    image_path: Path,
+    artifact_writer: WorkflowArtifactWriter,
+    group: dict[str, Any],
+    segments: list[dict[str, Any]],
+    stage: str,
+) -> None:
+    """Draw numbered candidate masks into a new debug round and record the image.
+
+    Does nothing when ``segments`` is empty. Otherwise the drawn image path is
+    added once to ``group["debug_images"]`` (the list is replaced, not mutated).
+    """
+    if not segments:
+        return
+    label = f"{stage}_{group['name']}"
+    round_name = artifact_writer.next_debug_round_name(label=label)
+    target = artifact_writer.debug_round_dir(round_name) / "mask.png"
+    image = draw_numbered_masks(
+        image_path=image_path, segments=segments, output_path=target
+    )
+    written = str(image)
+    recorded = list(group.get("debug_images") or [])
+    group["debug_images"] = recorded if written in recorded else [*recorded, written]
+
+
+def filter_group_segments_with_artifacts(
+    *,
+    llm: Any,
+    image_path: Path,
+    artifact_writer: WorkflowArtifactWriter,
+    group: dict[str, Any],
+    segments: list[dict[str, Any]],
+    stage: str,
+    confirmed_segments: list[dict[str, Any]] | None = None,
+) -> list[dict[str, Any]]:
+    """Run the VLM segment filter for one group inside a fresh debug round."""
+    label = f"{stage}_{group['name']}"
+    round_name = artifact_writer.next_debug_round_name(label=label)
+    round_dir = artifact_writer.debug_round_dir(round_name)
+    return filter_group_segments_with_vlm(
+        llm=llm,
+        image_path=image_path,
+        step_name=IMAGE_SEGMENTS_STEP,
+        group=group,
+        segments=segments,
+        stage=stage,
+        debug_round_name=round_name,
+        debug_round_dir=round_dir,
+        write_debug_json=artifact_writer.write_debug_round_json,
+        confirmed_segments=confirmed_segments,
+    )
+
+
+def select_largest_table_segment(
+    segments: list[dict[str, Any]],
+) -> dict[str, Any] | None:
+    """Return the segment with the largest ``segment_area``, or None if there is none."""
+    # max() keeps the first of equally large segments; no VLM filtering is
+    # applied. The falsy check also covers a None argument.
+    return max(segments, key=segment_area) if segments else None
+
+
+def segment_area(segment: dict[str, Any]) -> float:
+    """Return the non-zero mask pixel count of ``segment``, else its bbox area."""
+    mask_rle = segment.get("mask_rle")
+    if mask_rle is not None:
+        try:
+            histogram = decode_rle_mask(mask_rle).convert("L").histogram()
+            return float(sum(histogram[1:]))  # bin 0 counts the zero pixels
+        except Exception as error:  # best effort: any decode failure falls back
+            log.log_warning(f"segment_area: mask decode failed, using bbox: {error}")
+    x1, y1, x2, y2 = segment["bbox_xyxy"]
+    return max(0.0, float(x2) - float(x1)) * max(0.0, float(y2) - float(y1))
+
+
+def merge_non_overlapping_segments(
+    *,
+    existing: list[dict[str, Any]],
+    incoming: list[dict[str, Any]],
+    limit: int,
+) -> list[dict[str, Any]]:
+    """Add incoming segments, best score first, that do not overlap kept ones.
+
+    A segment is kept when its bbox IoU with every kept segment is below
+    OVERLAP_IOU_THRESHOLD; merging stops once ``limit`` segments are held.
+    """
+    kept = list(existing)
+    ranked = sorted(incoming, key=lambda item: float(item["score"]), reverse=True)
+    for candidate in ranked:
+        if len(kept) >= limit:
+            break
+        ious = (bbox_iou(candidate["bbox_xyxy"], held["bbox_xyxy"]) for held in kept)
+        if all(iou < OVERLAP_IOU_THRESHOLD for iou in ious):
+            kept.append(candidate)
+    return sort_segments_by_bbox(kept)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/paths.py b/embodichain/gen_sim/prompt2scene/workflows/paths.py
new file mode 100644
index 000000000..3a297ed7d
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/paths.py
@@ -0,0 +1,215 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+__all__ = [
+ "DEBUG_DIRNAME",
+ "IMAGE_SEGMENTS_STEP",
+ "IMAGE_SPATIAL_RELATIONS_STEP",
+ "RAW_MODEL_OUTPUT_FILENAME",
+ "SCENE_INTAKE_STEP",
+ "SCENE_EDIT_STEP",
+ "STEP_RESULT_FILENAME",
+ "UNIFIED_SCENE_GEN_STEP",
+ "UNIFIED_SCENE_STEP",
+ "PipelinePaths",
+ "debug_dir_path",
+ "debug_round_dir_path",
+ "next_debug_round_dir_path",
+ "next_debug_round_name",
+ "resolve_generated_path",
+ "step_dir_path",
+ "step_result_path",
+]
+
+STEP_RESULT_FILENAME = "result.json"  # file name appended by step_result_path
+DEBUG_DIRNAME = "debug"  # subdirectory name appended by debug_dir_path
+RAW_MODEL_OUTPUT_FILENAME = "raw_model_output.json"
+
+SCENE_INTAKE_STEP = "scene_intake"  # step names are used as directory names under output_root
+SCENE_EDIT_STEP = "scene_edit"
+IMAGE_SEGMENTS_STEP = "image_segments"
+IMAGE_SPATIAL_RELATIONS_STEP = "image_spatial_relations"
+UNIFIED_SCENE_STEP = "unified_scene"
+UNIFIED_SCENE_GEN_STEP = "unified_scene_gen"
+
+_DEBUG_ROUND_PATTERN = re.compile(r"^round_(\d+)(?:_|$)")  # round_<digits>, then "_" or end; group 1 = index
+
+
+def step_dir_path(output_root: Path, step_name: str) -> Path:
+    return output_root.joinpath(step_name)  # directory of one step: <output_root>/<step_name>
+
+
+def step_result_path(output_root: Path, step_name: str) -> Path:
+    return step_dir_path(output_root, step_name).joinpath(STEP_RESULT_FILENAME)  # result file of the step
+
+
+def debug_dir_path(output_root: Path, step_name: str) -> Path:
+    return step_dir_path(output_root, step_name).joinpath(DEBUG_DIRNAME)  # debug directory of the step
+
+
+def debug_round_dir_path(output_root: Path, step_name: str, round_name: str) -> Path:
+    """Return the directory of one debug round.
+
+    Layout: ``<output_root>/<step_name>/<DEBUG_DIRNAME>/<round_name>``.
+    """
+    return debug_dir_path(output_root, step_name).joinpath(round_name)
+
+
+def next_debug_round_name(
+    output_root: Path,
+    step_name: str,
+    label: str | None = None,
+) -> str:
+    """Return the name of the next debug round for a step.
+
+    The name is ``round_<NNN>`` where NNN is one more than the highest index
+    among existing ``round_<digits>...`` subdirectories of the step's debug
+    directory (001 if there are none), zero-padded to three digits. A
+    non-empty ``label`` is appended as ``_<token>`` via ``_path_token``.
+    """
+    debug_dir = debug_dir_path(output_root, step_name)
+    entries = debug_dir.iterdir() if debug_dir.is_dir() else ()
+    matches = (_DEBUG_ROUND_PATTERN.match(e.name) for e in entries if e.is_dir())
+    highest = max((int(m.group(1)) for m in matches if m is not None), default=0)
+    stem = f"round_{highest + 1:03d}"
+    return f"{stem}_{_path_token(label)}" if label else stem
+
+
+def next_debug_round_dir_path(
+    output_root: Path,
+    step_name: str,
+    label: str | None = None,
+) -> Path:
+    """Return the directory path of the next debug round for a step."""
+    # Only computes the path: the directory listing is read to pick the
+    # round index, but nothing is created on disk here.
+    round_name = next_debug_round_name(output_root, step_name, label)
+    return debug_round_dir_path(output_root, step_name, round_name)
+
+
+def resolve_generated_path(value: Any, output_root: Path) -> Path:
+    """Resolve ``value`` against ``output_root`` unless absolute; falsy gives ``Path()``."""
+    if not value:
+        return Path()
+    path = Path(str(value)).expanduser()
+    base = Path() if path.is_absolute() else output_root.expanduser().resolve()
+    return (base / path).resolve()
+
+
+def _path_token(value: str) -> str:
+    """Make a filesystem-safe token (max 80 chars); "round" if nothing is left."""
+    return "".join(c if c.isalnum() else "_" for c in value).strip("_")[:80] or "round"
+
+
+@dataclass(frozen=True)
+class PipelinePaths:
+    """Well-known files and directories below one pipeline ``output_root``."""
+    output_root: Path  # expanded and resolved in __post_init__
+
+    def __post_init__(self) -> None:
+        object.__setattr__(self, "output_root", self.output_root.expanduser().resolve())
+
+    @property
+    def scene_intake_dir(self) -> Path:
+        return step_dir_path(self.output_root, SCENE_INTAKE_STEP)
+
+    @property
+    def scene_edit_dir(self) -> Path:
+        return step_dir_path(self.output_root, SCENE_EDIT_STEP)
+
+    @property
+    def image_segments_dir(self) -> Path:
+        return step_dir_path(self.output_root, IMAGE_SEGMENTS_STEP)
+
+    @property
+    def image_spatial_relations_dir(self) -> Path:
+        return step_dir_path(self.output_root, IMAGE_SPATIAL_RELATIONS_STEP)
+
+    @property
+    def unified_scene_dir(self) -> Path:
+        return step_dir_path(self.output_root, UNIFIED_SCENE_STEP)
+
+    @property
+    def unified_scene_gen_dir(self) -> Path:
+        return step_dir_path(self.output_root, UNIFIED_SCENE_GEN_STEP)
+
+    def step_result(self, step: str) -> Path:
+        return step_result_path(self.output_root, step)  # <output_root>/<step>/result.json
+
+    @property
+    def scene_intake_result(self) -> Path:
+        return self.step_result(SCENE_INTAKE_STEP)
+
+    @property
+    def image_segments_result(self) -> Path:
+        return self.step_result(IMAGE_SEGMENTS_STEP)
+
+    @property
+    def unified_scene_result(self) -> Path:
+        return self.step_result(UNIFIED_SCENE_STEP)
+
+    def resolve_scene_result(self, explicit_path: Path | None) -> Path:
+        """Pick the explicit path, else result.json, else an existing legacy results.json."""
+        if explicit_path is not None:
+            return explicit_path.expanduser().resolve()
+        current = self.unified_scene_result
+        legacy = self.unified_scene_dir / "results.json"
+        return legacy if not current.is_file() and legacy.is_file() else current
+
+    @property
+    def gen_image_dir(self) -> Path:
+        return self.unified_scene_gen_dir.joinpath("image_gen")
+
+    @property
+    def gen_glb_dir(self) -> Path:
+        return self.unified_scene_gen_dir.joinpath("glb_gen")
+
+    @property
+    def gen_debug_dir(self) -> Path:
+        return self.unified_scene_gen_dir.joinpath("debug")
+
+    @property
+    def table_fit_dir(self) -> Path:
+        return self.gen_glb_dir.joinpath("table_fit_to_clutter")
+
+    @property
+    def simready_to_aligned_manifest(self) -> Path:
+        return self.gen_glb_dir.joinpath("simready_to_aligned_manifest.json")
+
+    @property
+    def table_fit_manifest(self) -> Path:
+        return self.table_fit_dir.joinpath("table_fit_to_clutter_manifest.json")
+
+    @property
+    def gym_export_dir(self) -> Path:
+        return self.output_root.joinpath("gym_export")
+
+    @property
+    def gym_config(self) -> Path:
+        return self.gym_export_dir.joinpath("gym_config.json")
+
+    def prepare_generation_dirs(self) -> tuple[Path, Path, Path]:
+        targets = (self.gen_image_dir, self.gen_glb_dir, self.gen_debug_dir)
+        for target in targets:  # create image_gen, glb_gen and debug, parents included
+            target.mkdir(parents=True, exist_ok=True)
+        return targets
diff --git a/embodichain/gen_sim/prompt2scene/workflows/request.py b/embodichain/gen_sim/prompt2scene/workflows/request.py
new file mode 100644
index 000000000..8fcd65c3e
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/request.py
@@ -0,0 +1,124 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+
+__all__ = ["InputKind", "Prompt2SceneInput"]
+
+SUPPORTED_IMAGE_SUFFIXES: frozenset[str] = frozenset({".jpg", ".jpeg", ".png"})  # compared with the lower-cased path suffix
+
+
+class InputKind(str, Enum):
+    """Supported prompt2scene input kinds."""
+
+    IMAGE = "image"  # set by Prompt2SceneInput.from_cli_args when an image path is given
+    EDIT = "edit"  # set by Prompt2SceneInput.from_cli_args when no image path is given
+
+
+@dataclass(frozen=True)
+class Prompt2SceneInput:
+    """Normalized prompt2scene input (immutable); build it with ``from_cli_args``."""
+
+    input_kind: InputKind
+    output_root: Path
+    image_path: Path | None = None  # set for IMAGE inputs
+    prompt: str | None = None  # stripped; None when absent or blank
+
+    @classmethod
+    def from_cli_args(
+        cls,
+        *,
+        image_path: Path | None,
+        prompt: str | None,
+        output_root: Path,
+    ) -> "Prompt2SceneInput":
+        """Create a prompt2scene input from CLI arguments.
+
+        With ``image_path`` the result is an IMAGE input (the prompt is
+        optional); without it, an EDIT input that needs a non-blank prompt.
+
+        Args:
+            image_path: Input image path, if image mode is selected.
+            prompt: Optional edit prompt; blank text is treated as missing.
+            output_root: Directory where prompt2scene outputs are written.
+
+        Returns:
+            Normalized prompt2scene input.
+
+        Raises:
+            FileNotFoundError: If the image does not exist, or in edit-only
+                mode the exported scene state file is missing.
+            ValueError: If the image path is invalid, or neither an image
+                nor a prompt is given.
+        """
+        output_root = output_root.expanduser().resolve()
+        prompt_text = (prompt or "").strip() or None
+        if image_path is None:
+            prompt_text = cls._validate_edit_only_prompt(prompt_text, output_root)
+            return cls(
+                input_kind=InputKind.EDIT, output_root=output_root, prompt=prompt_text
+            )
+        image_path = image_path.expanduser().resolve()
+        cls._validate_image_path(image_path)
+        return cls(
+            input_kind=InputKind.IMAGE,
+            image_path=image_path,
+            output_root=output_root,
+            prompt=prompt_text,
+        )
+
+    def to_manifest(self) -> dict[str, str]:
+        """Return a JSON-serializable dict describing this input.
+
+        ``image_path`` is included for IMAGE inputs and ``prompt`` when set.
+        """
+        manifest: dict[str, str] = {"input_kind": self.input_kind.value}
+        manifest["output_root"] = str(self.output_root)
+        if self.input_kind == InputKind.IMAGE:
+            manifest["image_path"] = str(self.image_path)
+        if self.prompt is not None:
+            manifest["prompt"] = self.prompt
+        return manifest
+
+    @staticmethod
+    def _validate_edit_only_prompt(prompt: str | None, output_root: Path) -> str:
+        """Return ``prompt`` if edit-only mode is possible for ``output_root``."""
+        if prompt is None:
+            raise ValueError(
+                "Provide --image or --prompt with an existing output_root."
+            )
+        # Edit-only mode needs the scene state exported by an earlier run.
+        scene_state = output_root.joinpath("gym_export", "scene_state", "result.json")
+        if scene_state.is_file():
+            return prompt
+        message = f"Edit-only mode requires an existing scene state: {scene_state}"
+        raise FileNotFoundError(message)
+
+    @staticmethod
+    def _validate_image_path(image_path: Path) -> None:
+        """Require an existing regular file whose suffix is a supported image type."""
+        if not image_path.exists():
+            raise FileNotFoundError(f"Image input not found: {image_path}")
+        if not image_path.is_file():
+            raise ValueError(f"Image input is not a file: {image_path}")
+        if image_path.suffix.lower() not in SUPPORTED_IMAGE_SUFFIXES:
+            raise ValueError(
+                "Image input must have one of these extensions: .jpg, .jpeg, .png"
+            )
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_edit/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/scene_edit/__init__.py
new file mode 100644
index 000000000..addab9d30
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_edit/__init__.py
@@ -0,0 +1,23 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.scene_edit.graph import (
+ run_scene_edit,
+)
+
+__all__ = ["run_scene_edit"]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_edit/graph.py b/embodichain/gen_sim/prompt2scene/workflows/scene_edit/graph.py
new file mode 100644
index 000000000..c831cc6c1
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_edit/graph.py
@@ -0,0 +1,109 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import shutil
+from typing import TYPE_CHECKING
+
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+ write_step_result,
+)
+from embodichain.gen_sim.prompt2scene.workflows.paths import SCENE_EDIT_STEP
+from embodichain.gen_sim.prompt2scene.workflows.scene_edit.nodes import (
+ analyze_scene_edit_intent_node,
+ generate_edit_assets_node,
+ optimize_edit_layout_node,
+ update_scene_files_node,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_edit.schema import (
+ SceneEditRequest,
+ SceneEditResult,
+)
+
+__all__ = ["run_scene_edit"]
+
+if TYPE_CHECKING:
+ from embodichain.gen_sim.prompt2scene.llms import OpenAICompatibleLLMCfg
+
+
+def run_scene_edit(
+    request: SceneEditRequest,
+    *,
+    llm_cfg: OpenAICompatibleLLMCfg | None = None,
+) -> SceneEditResult:
+    """Run the scene edit workflow on an already exported scene.
+
+    Raises:
+        FileNotFoundError: If ``gym_export/scene_state/result.json`` is missing.
+        ValueError: If ``llm_cfg`` is None (checked before any directory is made).
+    """
+    output_root = request.output_root.expanduser().resolve()
+    scene_state_path = output_root / "gym_export" / "scene_state" / "result.json"
+    if not scene_state_path.is_file():
+        raise FileNotFoundError(
+            "Scene edit requires an existing exported scene state: "
+            f"{scene_state_path}"
+        )
+    # Reject a missing config before mkdir so a failed call leaves no directory.
+    if llm_cfg is None:
+        raise ValueError("Scene edit requires an LLM config for intent analysis.")
+    output_dir = output_root / SCENE_EDIT_STEP
+    output_dir.mkdir(parents=True, exist_ok=True)
+    from embodichain.gen_sim.prompt2scene.llms import build_chat_model
+    llm = build_chat_model(llm_cfg)
+    intent_analysis = analyze_scene_edit_intent_node(
+        request=request, output_dir=output_dir, llm=llm
+    )
+    generated_assets = generate_edit_assets_node(
+        intent_analysis=intent_analysis, output_dir=output_dir, llm=llm
+    )
+    layout_result = optimize_edit_layout_node(
+        request=request,
+        intent_analysis=intent_analysis,
+        generated_assets=generated_assets,
+        output_dir=output_dir,
+    )
+    file_updates = update_scene_files_node(
+        intent_analysis=intent_analysis,
+        generated_assets=generated_assets,
+        layout_result=layout_result,
+        output_dir=output_dir,
+    )
+
+    result = SceneEditResult(
+        status="ok" if file_updates.get("status") == "ok" else "partial",
+        prompt=request.prompt,
+        scene_state_path=scene_state_path,
+        reason=(
+            "Scene edit intent analysis, asset generation, layout optimization, "
+            "and gym_export file updates completed."
+        ),
+        steps={
+            "intent_analysis": intent_analysis,
+            "generated_assets": generated_assets,
+            "layout_optimization": layout_result,
+            "file_updates": file_updates,
+        },
+    )
+    resolved_intent = intent_analysis.get("resolved_intent")
+    if isinstance(resolved_intent, dict):
+        write_json(output_dir / "resolved_intent.json", resolved_intent)
+    write_step_result(output_root, SCENE_EDIT_STEP, result.to_manifest())
+    if request.cleanup_scene_edit_dir and output_dir.is_dir():
+        shutil.rmtree(output_dir)
+    return result
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_edit/nodes.py b/embodichain/gen_sim/prompt2scene/workflows/scene_edit/nodes.py
new file mode 100644
index 000000000..ae3360d66
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_edit/nodes.py
@@ -0,0 +1,328 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.prompts.builders import (
+ build_scene_edit_intent_messages,
+ build_text_metric_scale_messages,
+)
+from embodichain.gen_sim.prompt2scene.prompts.schemas import (
+ IMAGE_METRIC_SCALE_JSON_SCHEMA,
+ SCENE_EDIT_INTENT_JSON_SCHEMA,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.text_scene_metric_scale import (
+ estimate_text_scene_metric_scale,
+)
+from embodichain.gen_sim.prompt2scene.utils import (
+ log,
+ log_api_request_start,
+)
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+ format_attempt_error,
+ format_result_missing_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_edit.schema import (
+ SceneEditRequest,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_edit.utils import (
+ build_scene_edit_layout,
+ export_scene_edit_gym_state,
+ extract_current_grids,
+ extract_current_relations,
+ extract_scene_objects,
+ generate_scene_edit_object_assets,
+ load_json_object,
+ match_prompt_scene_objects,
+ resolve_scene_edit_intent,
+ scene_state_path,
+)
+
+__all__ = [
+ "analyze_scene_edit_intent_node",
+ "generate_edit_assets_node",
+ "optimize_edit_layout_node",
+ "update_scene_files_node",
+]
+
+
def analyze_scene_edit_intent_node(
    *,
    request: SceneEditRequest,
    output_dir: Path,
    llm: Any,
) -> dict[str, Any]:
    """Derive a structured edit intent from the exported scene and the prompt.

    The scene state exported under ``request.output_root`` is loaded, its
    objects, relations and grid assignments are extracted, and the LLM is
    asked (at most three attempts) for a JSON edit intent. The raw intent is
    then resolved against the current scene.

    Args:
        request: Edit request; supplies ``output_root`` and ``prompt``.
        output_dir: Scene-edit working directory. Not read by this node.
        llm: Chat model handed to ``call_structured_json_model_step``.

    Returns:
        Analysis manifest holding the scene summary, current relations and
        grids, prompt/object matches, the raw LLM intent and the resolved
        intent. A ``debug`` entry with the retry errors is present only when
        at least one attempt failed.

    Raises:
        FileNotFoundError: If the exported scene state file is missing.
        RuntimeError: If no attempt produced a structured model output.
    """
    state_path = scene_state_path(request.output_root)
    if not state_path.is_file():
        raise FileNotFoundError(
            "Scene edit requires an existing exported scene state: "
            f"{state_path}"
        )

    scene_state = load_json_object(state_path)
    scene_objects = extract_scene_objects(scene_state)
    current_relations = extract_current_relations(
        output_root=request.output_root, scene_state=scene_state
    )
    current_grids = extract_current_grids(
        output_root=request.output_root, scene_state=scene_state
    )
    messages = build_scene_edit_intent_messages(
        prompt=request.prompt,
        scene_objects=scene_objects,
        current_relations=current_relations,
    )
    from embodichain.gen_sim.prompt2scene.llms.llm_output import (
        StructuredModelCallError,
        call_structured_json_model_step,
    )

    max_attempts = 3
    attempt_count = 0
    errors: list[str] = []
    raw_model_output: dict[str, Any] | None = None
    retry_messages = list(messages)
    for attempt_count in range(1, max_attempts + 1):
        try:
            log_api_request_start(
                step="scene_edit",
                request="intent_analysis",
                attempt=attempt_count,
            )
            raw_model_output = call_structured_json_model_step(
                llm=llm,
                schema=SCENE_EDIT_INTENT_JSON_SCHEMA,
                messages=retry_messages,
                context="Scene edit intent",
                attempt_count=attempt_count,
                raw_output_writer=None,
            )
        except StructuredModelCallError as exc:
            error = format_attempt_error("Scene edit intent", attempt_count, exc)
            errors.append(error)
            log.log_warning(error)
            # Each retry restarts from the original messages and appends only
            # the most recent failure, so feedback does not pile up.
            feedback = {
                "role": "user",
                "content": (
                    "The previous JSON output failed schema validation. "
                    f"Fix this exact error and output the full JSON again: {exc}"
                ),
            }
            retry_messages = [*messages, feedback]
        else:
            break

    if raw_model_output is None:
        last_error = errors[-1] if errors else None
        raise RuntimeError(
            format_result_missing_error(
                "Scene edit intent",
                "SceneEditIntentOutput",
                attempt_count=attempt_count,
                last_error=last_error,
                errors=errors,
            )
        )

    resolved_intent = resolve_scene_edit_intent(
        intent=raw_model_output,
        scene_objects=scene_objects,
        current_relations=current_relations,
        current_grids=current_grids,
    )
    source_snapshots = scene_state.get("source_snapshots") or {}
    object_matches = match_prompt_scene_objects(
        prompt=request.prompt,
        scene_state=scene_state,
    )

    analysis: dict[str, Any] = {
        "status": "ok",
        "node": "analyze_scene_edit_intent",
        "prompt": request.prompt,
        "scene_state_path": str(state_path),
        "source_snapshots": source_snapshots,
        "scene_summary": {
            "object_count": len(scene_objects),
            "objects": scene_objects,
        },
        "current_relations": current_relations,
        "current_grid_assignments": current_grids,
        "prompt_object_matches": object_matches,
        "llm_intent": raw_model_output,
        "resolved_intent": resolved_intent,
    }
    # Errors are recorded exactly when an attempt failed before success.
    if errors:
        analysis["debug"] = {"retry_errors": errors}
    return analysis
+
+
+def generate_edit_assets_node(
+ *,
+ intent_analysis: dict[str, Any],
+ output_dir: Path,
+ llm: Any | None = None,
+) -> dict[str, Any]:
+ """Generate simready assets for add/replace objects in a scene edit."""
+ intent = intent_analysis.get("resolved_intent")
+ if not isinstance(intent, dict):
+ intent = {}
+ generated_objects = intent.get("generated_objects")
+ if not isinstance(generated_objects, list):
+ generated_objects = []
+ if not generated_objects:
+ return {
+ "status": "ok",
+ "node": "generate_edit_assets",
+ "input_intent_status": intent_analysis.get("status"),
+ "objects_to_generate": [],
+ "generated_assets": [],
+ "reason": "No new objects were requested by the edit intent.",
+ }
+ generation_result = generate_scene_edit_object_assets(
+ generated_objects=generated_objects,
+ output_root=output_dir.parent,
+ output_dir=output_dir,
+ )
+ generated_assets = generation_result.get("generated_assets", [])
+ if isinstance(generated_assets, list) and generated_assets:
+ metric_prompt_objects = [
+ {
+ "object_id": str(obj.get("id", "")),
+ "object_name": str(obj.get("name", "")),
+ "object_description": str(obj.get("description", "")),
+ }
+ for obj in generated_assets
+ ]
+ prompt_text = str(intent_analysis.get("prompt") or "")
+ metric_scale_result = estimate_text_scene_metric_scale(
+ object_results=generated_assets,
+ user_text=prompt_text,
+ messages=build_text_metric_scale_messages(
+ user_text=prompt_text,
+ objects_json=metric_prompt_objects,
+ ),
+ schema=IMAGE_METRIC_SCALE_JSON_SCHEMA,
+ output_dir=output_dir / "glb_gen" / "metric_scale",
+ output_root=output_dir.parent,
+ llm=llm,
+ step_name="scene_edit",
+ )
+ else:
+ metric_scale_result = {
+ "status": "skipped",
+ "reason": "no_generated_assets",
+ "objects": [],
+ }
+ result = {
+ "status": generation_result.get("status", "partial"),
+ "node": "generate_edit_assets",
+ "input_intent_status": intent_analysis.get("status"),
+ "objects_to_generate": generated_objects,
+ "generated_assets": generated_assets,
+ "object_count": generation_result.get("object_count", 0),
+ "metric_scale": metric_scale_result,
+ "reason": (
+ "Generated simready assets for scene-edit add/replace objects."
+ ),
+ }
+ return result
+
+
def optimize_edit_layout_node(
    *,
    request: SceneEditRequest,
    intent_analysis: dict[str, Any],
    generated_assets: dict[str, Any],
    output_dir: Path,
) -> dict[str, Any]:
    """Rebuild the 2D layout of the edited scene from the previous scene state.

    Args:
        request: Edit request; only ``optimize_new_objects_only`` is read.
        intent_analysis: Intent-analysis manifest; supplies
            ``scene_state_path`` and ``resolved_intent``.
        generated_assets: Asset-generation manifest; its ``generated_assets``
            list is forwarded to the layout builder.
        output_dir: Scene-edit working directory; its parent is used as the
            pipeline output root.

    Returns:
        Node manifest summarising the layout returned by
        ``build_scene_edit_layout``.
    """
    scene_state_value = intent_analysis.get("scene_state_path", "")
    previous_state = load_json_object(Path(str(scene_state_value)))

    # Malformed upstream payloads degrade to "no intent" / "no assets".
    intent = intent_analysis.get("resolved_intent")
    intent = intent if isinstance(intent, dict) else {}
    asset_items = generated_assets.get("generated_assets")
    asset_items = asset_items if isinstance(asset_items, list) else []

    layout = build_scene_edit_layout(
        scene_state=previous_state,
        resolved_intent=intent,
        generated_assets=asset_items,
        output_root=output_dir.parent,
        optimize_new_objects_only=request.optimize_new_objects_only,
    )

    summary: dict[str, Any] = {
        "status": layout.get("status", "ok"),
        "node": "optimize_edit_layout",
        "existing_scene_state_path": scene_state_value,
        "generated_asset_count": len(asset_items),
        "deleted_object_ids": layout.get("deleted_object_ids", []),
        "support_region": layout.get("support_region", {}),
        "layout_updates": layout.get("layout_updates", []),
        "optimization": layout.get("optimization", {}),
        "reason": (
            "Loaded the previous scene_state 2D footprints, inherited replacement "
            "object centers, computed generated-object XY sizes from simready GLBs, "
            "and applied relation/grid-based local layout optimization."
        ),
    }
    return summary
+
+
def update_scene_files_node(
    *,
    intent_analysis: dict[str, Any],
    generated_assets: dict[str, Any],
    layout_result: dict[str, Any],
    output_dir: Path,
) -> dict[str, Any]:
    """Write the edited scene back into the gym_export outputs.

    Args:
        intent_analysis: Intent-analysis manifest; supplies
            ``scene_state_path`` and ``status``.
        generated_assets: Asset-generation manifest; supplies the
            ``generated_assets`` list and ``status``.
        layout_result: Layout manifest; supplies ``layout_updates`` and
            ``status``.
        output_dir: Scene-edit working directory; its parent is used as the
            pipeline output root.

    Returns:
        Node manifest with the export status, the updated files reported by
        ``export_scene_edit_gym_state`` and the statuses of the upstream nodes.
    """
    state_path_value = intent_analysis.get("scene_state_path", "")
    previous_state = load_json_object(Path(str(state_path_value)))

    # Non-list payloads are treated as empty rather than rejected.
    asset_items = generated_assets.get("generated_assets")
    asset_items = asset_items if isinstance(asset_items, list) else []
    updates = layout_result.get("layout_updates")
    updates = updates if isinstance(updates, list) else []

    export_result = export_scene_edit_gym_state(
        output_root=output_dir.parent,
        scene_state=previous_state,
        generated_assets=asset_items,
        layout_updates=updates,
        output_dir=output_dir,
    )

    upstream_statuses = {
        "intent_status": intent_analysis.get("status"),
        "generated_assets_status": generated_assets.get("status"),
        "layout_status": layout_result.get("status"),
    }
    return {
        "status": export_result.get("status", "ok"),
        "node": "update_scene_files",
        "updated_files": export_result.get("updated_files", []),
        "reason": (
            "Updated gym_export outputs from the edited scene layout, including "
            "gym_config, scene_state/result.json, topdown_2d.png, and any new "
            "simready mesh assets."
        ),
        "inputs": upstream_statuses,
    }
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_edit/schema.py b/embodichain/gen_sim/prompt2scene/workflows/scene_edit/schema.py
new file mode 100644
index 000000000..cda659b84
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_edit/schema.py
@@ -0,0 +1,53 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+__all__ = ["SceneEditRequest", "SceneEditResult"]
+
+
@dataclass(frozen=True)
class SceneEditRequest:
    """Input for editing an existing generated scene.

    Instances are immutable (``frozen=True``).
    """

    # Root directory of the previously generated scene; the exported scene
    # state is looked up beneath it.
    output_root: Path
    # Natural-language edit instruction given to the intent-analysis LLM.
    prompt: str
    # When True, the workflow removes the scene-edit working directory after
    # the step result has been written.
    cleanup_scene_edit_dir: bool = False
    # Forwarded to the layout builder; when True only newly added objects and
    # explicitly repositioned replacements are left movable.
    optimize_new_objects_only: bool = True
+
+
@dataclass(frozen=True)
class SceneEditResult:
    """Outcome of one scene-edit run together with the per-node outputs."""

    status: str
    prompt: str
    scene_state_path: Path
    reason: str
    steps: dict[str, Any]

    def to_manifest(self) -> dict[str, Any]:
        """Return the result as a plain dict with the path rendered as text.

        ``steps`` is passed through by reference, not copied.
        """
        return dict(
            status=self.status,
            prompt=self.prompt,
            scene_state_path=str(self.scene_state_path),
            reason=self.reason,
            steps=self.steps,
        )
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_edit/utils.py b/embodichain/gen_sim/prompt2scene/workflows/scene_edit/utils.py
new file mode 100644
index 000000000..795d109c7
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_edit/utils.py
@@ -0,0 +1,1531 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+import math
+import re
+import shutil
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from embodichain.gen_sim.prompt2scene.agent_tools.managers.layout_manager import (
+ LayoutManager,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.text_asset_generation import (
+ generate_text_object_assets,
+)
+from embodichain.gen_sim.prompt2scene.utils.io import relative_path, write_json
+from embodichain.gen_sim.prompt2scene.workflows.gym_export import (
+ _glb_scale_to_sim,
+ _render_scene_state_topdown,
+)
+from embodichain.gen_sim.prompt2scene.workflows.paths import PipelinePaths
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.spatial_relations import (
+ transitive_relation_closure,
+)
+from embodichain.gen_sim.prompt2scene.utils.log import log_info
+
+__all__ = [
+ "build_scene_edit_layout",
+ "extract_current_grids",
+ "extract_current_relations",
+ "extract_scene_edit_support_region",
+ "extract_scene_object_footprints",
+ "extract_scene_objects",
+ "generate_scene_edit_object_assets",
+ "export_scene_edit_gym_state",
+ "load_json_object",
+ "match_prompt_scene_objects",
+ "resolve_scene_edit_intent",
+ "resolve_scene_state_snapshot_path",
+ "scene_state_path",
+ "tokenize_text",
+ "validate_scene_edit_intent",
+]
+
+
def scene_state_path(output_root: Path) -> Path:
    """Return the exported scene-state JSON path beneath ``output_root``."""
    return output_root.joinpath("gym_export", "scene_state", "result.json")
+
+
def load_json_object(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON file and return its top-level object.

    Raises:
        ValueError: If the document's top-level value is not a JSON object.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"Expected JSON object at {path}")
+
+
def extract_scene_objects(scene_state: dict[str, Any]) -> list[dict[str, str]]:
    """Return the minimal object view used by the edit-intent LLM.

    Args:
        scene_state: Parsed scene state; its ``objects`` entry is read.

    Returns:
        One ``{"id", "name", "description"}`` dict per object that is a dict
        and has a non-empty id, in scene order. A ``name`` or ``description``
        that is JSON ``null`` is reported as an empty string.
    """

    def _text(value: Any) -> str:
        # JSON null decodes to None; str(None) would leak the literal text
        # "None" into the object view that is shown to the LLM.
        return "" if value is None else str(value).strip()

    objects: list[dict[str, str]] = []
    for obj in scene_state.get("objects", []) or []:
        if not isinstance(obj, dict):
            continue
        # The id keeps its plain str() conversion so it stays identical to the
        # ids derived the same way elsewhere in this module.
        object_id = str(obj.get("id", "")).strip()
        if not object_id:
            continue
        objects.append(
            {
                "id": object_id,
                "name": _text(obj.get("name", "")),
                "description": _text(obj.get("description", "")),
            }
        )
    return objects
+
+
def extract_scene_object_footprints(
    scene_state: dict[str, Any],
) -> dict[str, dict[str, Any]]:
    """Collect each object's 2D footprint, keyed by object id.

    Objects are skipped when they are not dicts, have no id, or carry a
    ``footprint_2d`` whose ``center_xy`` / ``size_xy`` are not 2-item lists or
    whose ``aabb_xy`` is not a list of two 2-item lists. All coordinates are
    converted to ``float``; a blank or missing unit becomes ``"m"``.
    """

    def _is_pair(value: Any) -> bool:
        return isinstance(value, list) and len(value) == 2

    collected: dict[str, dict[str, Any]] = {}
    for entry in scene_state.get("objects", []) or []:
        if not isinstance(entry, dict):
            continue
        object_id = str(entry.get("id", "")).strip()
        raw = entry.get("footprint_2d")
        if not object_id or not isinstance(raw, dict):
            continue

        center = raw.get("center_xy")
        bounds = raw.get("aabb_xy")
        extent = raw.get("size_xy")
        if not _is_pair(center):
            continue
        if not _is_pair(bounds) or not all(_is_pair(corner) for corner in bounds):
            continue
        if not _is_pair(extent):
            continue

        lower, upper = bounds
        collected[object_id] = {
            "unit": str(raw.get("unit", "m")).strip() or "m",
            "center_xy": [float(v) for v in center],
            "aabb_xy": [[float(v) for v in lower], [float(v) for v in upper]],
            "size_xy": [float(v) for v in extent],
        }
    return collected
+
+
def extract_scene_edit_support_region(scene_state: dict[str, Any]) -> dict[str, Any]:
    """Return ``table.support_region_2d`` from the previous scene state.

    When the table entry or its support region is missing or not a dict, an
    empty region manifest in metres is returned instead.
    """
    table = scene_state.get("table")
    region = table.get("support_region_2d") if isinstance(table, dict) else None
    if isinstance(region, dict):
        return region
    return {"unit": "m", "center_xy": [], "aabb_xy": [], "size_xy": [], "corners_xy": []}
+
+
+def resolve_scene_state_snapshot_path(
+ *,
+ output_root: Path,
+ scene_state: dict[str, Any],
+ snapshot_name: str,
+) -> Path | None:
+ """Resolve a snapshot path recorded in gym_export/scene_state/result.json."""
+ source_snapshots = scene_state.get("source_snapshots")
+ if not isinstance(source_snapshots, dict):
+ return None
+ snapshot_value = source_snapshots.get(snapshot_name)
+ if not isinstance(snapshot_value, str) or not snapshot_value:
+ return None
+ snapshot_path = Path(snapshot_value)
+ if snapshot_path.is_absolute():
+ return snapshot_path
+ return output_root / "gym_export" / snapshot_path
+
+
def extract_current_relations(
    *,
    output_root: Path,
    scene_state: dict[str, Any],
) -> list[dict[str, str]]:
    """Read the relation list from the ``unified_scene`` snapshot, if present.

    Returns:
        One ``{"subject", "relation", "object", "source"}`` dict per relation
        under ``spatial.relations`` whose subject, relation and object are all
        non-empty after stripping. An empty list when the snapshot is not
        recorded, is not a file, or does not hold a relation list.
    """
    snapshot = resolve_scene_state_snapshot_path(
        output_root=output_root,
        scene_state=scene_state,
        snapshot_name="unified_scene",
    )
    if snapshot is None or not snapshot.is_file():
        return []

    spatial = load_json_object(snapshot).get("spatial")
    raw_relations = spatial.get("relations") if isinstance(spatial, dict) else None
    if not isinstance(raw_relations, list):
        return []

    cleaned: list[dict[str, str]] = []
    for entry in raw_relations:
        if not isinstance(entry, dict):
            continue
        subject = str(entry.get("subject", "")).strip()
        relation_name = str(entry.get("relation", "")).strip()
        target = str(entry.get("object", "")).strip()
        if subject and relation_name and target:
            cleaned.append(
                {
                    "subject": subject,
                    "relation": relation_name,
                    "object": target,
                    "source": str(entry.get("source", "")).strip(),
                }
            )
    return cleaned
+
+
def extract_current_grids(
    *,
    output_root: Path,
    scene_state: dict[str, Any],
) -> dict[str, str]:
    """Read per-object grid assignments from the ``unified_scene`` snapshot.

    Returns:
        Mapping of object id to grid label for every snapshot object with a
        non-empty id and grid. An empty dict when the snapshot is not
        recorded, is not a file, or has no object list.
    """
    snapshot = resolve_scene_state_snapshot_path(
        output_root=output_root,
        scene_state=scene_state,
        snapshot_name="unified_scene",
    )
    if snapshot is None or not snapshot.is_file():
        return {}

    snapshot_objects = load_json_object(snapshot).get("objects")
    if not isinstance(snapshot_objects, list):
        return {}

    assignments: dict[str, str] = {}
    for entry in snapshot_objects:
        if isinstance(entry, dict):
            object_id = str(entry.get("id", "")).strip()
            # ``or ""`` keeps a null grid from turning into the text "None".
            grid = str(entry.get("grid", "") or "").strip()
            if object_id and grid:
                assignments[object_id] = grid
    return assignments
+
+
def resolve_scene_edit_intent(
    *,
    intent: dict[str, Any],
    scene_objects: list[dict[str, str]],
    current_relations: list[dict[str, str]],
    current_grids: dict[str, str],
) -> dict[str, Any]:
    """Resolve LLM edit operations into program-computed relations and grids.

    Args:
        intent: Raw edit intent produced by the LLM.
        scene_objects: Minimal object view of the current scene.
        current_relations: Relations of the scene before the edit.
        current_grids: Grid label per object id before the edit.

    Returns:
        Dict with ``deleted_object_ids`` (sorted), ``generated_objects``,
        ``operations``, ``updated_relations``, ``updated_grid_assignments``
        (sorted by object id), plus ``unresolved`` and ``reason`` copied from
        the intent.

    Note:
        Any exception raised by ``validate_scene_edit_intent`` or the private
        helpers propagates unchanged; their exact behavior is defined outside
        this function.
    """
    # Normalize ids first so validation and everything below see one spelling.
    intent = _normalize_scene_edit_intent_ids(
        intent=intent,
        scene_objects=scene_objects,
    )
    validate_scene_edit_intent(intent=intent, scene_objects=scene_objects)

    # Non-dict entries are silently ignored.
    operations = [op for op in intent.get("operations", []) if isinstance(op, dict)]
    generated_objects = _normalize_generated_objects(
        operations=operations,
        generated_objects=[
            obj for obj in intent.get("generated_objects", []) if isinstance(obj, dict)
        ],
    )
    # Temp ids of the objects that will actually be generated.
    generated_ids = {
        str(obj.get("temp_id", "")).strip()
        for obj in generated_objects
        if str(obj.get("temp_id", "")).strip()
    }
    deleted_ids = _string_set(intent.get("deleted_object_ids"), "deleted_object_ids")

    # replacement_map: replaced object id -> temp id of its replacement.
    # replacement_inherits: replaced ids whose replacement has no explicit
    # placement (empty or "preserve_target") and so takes over the old slot.
    replacement_map: dict[str, str] = {}
    replacement_inherits: set[str] = set()
    for operation in operations:
        if operation.get("type") != "replace":
            continue
        target_id = str(operation.get("target_object_id", "")).strip()
        new_id = str(operation.get("new_object_temp_id", "")).strip()
        if not target_id or not new_id:
            continue
        replacement_map[target_id] = new_id
        placement = operation.get("placement")
        placement_type = (
            str(placement.get("type", "")).strip()
            if isinstance(placement, dict)
            else ""
        )
        if placement_type in {"", "preserve_target"}:
            replacement_inherits.add(target_id)

    # Carry existing relations over, rewriting endpoints through the helper.
    direct_relations: list[dict[str, str]] = []
    for relation in current_relations:
        subject = str(relation.get("subject", "")).strip()
        object_id = str(relation.get("object", "")).strip()
        relation_name = str(relation.get("relation", "")).strip()
        mapped_subject = _map_relation_endpoint(
            object_id=subject,
            deleted_ids=deleted_ids,
            replacement_map=replacement_map,
            replacement_inherits=replacement_inherits,
        )
        mapped_object = _map_relation_endpoint(
            object_id=object_id,
            deleted_ids=deleted_ids,
            replacement_map=replacement_map,
            replacement_inherits=replacement_inherits,
        )
        # A None endpoint means the relation cannot be kept.
        # NOTE(review): presumably None marks a deleted, non-inherited object.
        if mapped_subject is None or mapped_object is None:
            continue
        # Drop relations that collapse onto a single object after mapping.
        if mapped_subject == mapped_object:
            continue
        direct_relations.append(
            {
                "subject": mapped_subject,
                "relation": relation_name,
                "object": mapped_object,
                "source": (
                    "replacement_inherited"
                    if mapped_subject != subject or mapped_object != object_id
                    else "preserved"
                ),
            }
        )

    # Grids: deleted objects lose their grid unless an inheriting replacement
    # takes it over; every other object keeps its grid.
    updated_grids: dict[str, str] = {}
    for object_id, grid in current_grids.items():
        if object_id in deleted_ids:
            replacement_id = replacement_map.get(object_id)
            if replacement_id and object_id in replacement_inherits:
                updated_grids[replacement_id] = grid
            continue
        updated_grids[object_id] = grid

    # Apply explicit placements requested for new (added/replacing) objects.
    for operation in operations:
        op_type = str(operation.get("type", "")).strip()
        if op_type not in {"add", "replace"}:
            continue
        new_id = str(operation.get("new_object_temp_id", "")).strip()
        if new_id not in generated_ids:
            continue
        placement = operation.get("placement")
        if not isinstance(placement, dict):
            continue
        placement_type = str(placement.get("type", "")).strip()
        if placement_type == "grid":
            # An explicit grid overrides any inherited one for this object.
            grid = str(placement.get("grid", "")).strip()
            if grid:
                updated_grids[new_id] = grid
        elif placement_type == "relative_to_object":
            reference_id = _map_reference_endpoint(
                object_id=str(placement.get("reference_object_id", "")).strip(),
                deleted_ids=deleted_ids,
                replacement_map=replacement_map,
            )
            relation = _placement_relation_to_canonical(
                new_object_id=new_id,
                relation=str(placement.get("relation", "")).strip(),
                reference_object_id=reference_id or "",
            )
            # The helper may return None, in which case no relation is added.
            if relation is not None:
                direct_relations.append({**relation, "source": "new_prompt"})

    return {
        "deleted_object_ids": sorted(deleted_ids),
        "generated_objects": generated_objects,
        "operations": operations,
        "updated_relations": _close_relations_with_sources(direct_relations),
        "updated_grid_assignments": dict(sorted(updated_grids.items())),
        "unresolved": intent.get("unresolved", []),
        "reason": intent.get("reason", ""),
    }
+
+
def tokenize_text(value: str) -> set[str]:
    """Split text into lower-cased ASCII alphanumeric tokens of length >= 2."""
    tokens: set[str] = set()
    for piece in re.split(r"[^a-zA-Z0-9]+", value.lower()):
        if len(piece) >= 2:
            tokens.add(piece)
    return tokens
+
+
def match_prompt_scene_objects(
    *,
    prompt: str,
    scene_state: dict[str, Any],
) -> list[dict[str, Any]]:
    """Return rough object candidates mentioned by the edit prompt.

    Each scene object's id, name and description are tokenized and compared
    with the prompt tokens.

    Args:
        prompt: The user's edit prompt.
        scene_state: Parsed scene state; its ``objects`` entry is read.

    Returns:
        One entry per object sharing at least one token with the prompt,
        carrying the matched tokens, a score (shared tokens divided by the
        object's token count) and the object's ``footprint_2d``. Entries are
        sorted by descending score.
    """
    prompt_tokens = tokenize_text(prompt)
    matches: list[dict[str, Any]] = []
    for obj in scene_state.get("objects", []) or []:
        if not isinstance(obj, dict):
            continue
        # JSON null decodes to None; str(None) would add a spurious "none"
        # token, producing false matches and deflating the score.
        text = " ".join(
            "" if obj.get(key) is None else str(obj.get(key))
            for key in ("id", "name", "description")
        )
        object_tokens = tokenize_text(text.replace("_", " "))
        overlap = sorted(prompt_tokens & object_tokens)
        if not overlap:
            continue
        score = len(overlap) / max(len(object_tokens), 1)
        matches.append(
            {
                "id": obj.get("id", ""),
                "name": obj.get("name", ""),
                "description": obj.get("description", ""),
                "matched_tokens": overlap,
                "score": score,
                "footprint_2d": obj.get("footprint_2d"),
            }
        )
    return sorted(matches, key=lambda item: float(item["score"]), reverse=True)
+
+
def generate_scene_edit_object_assets(
    *,
    generated_objects: list[dict[str, Any]],
    output_root: Path,
    output_dir: Path,
) -> dict[str, Any]:
    """Run text-to-asset generation for the objects a scene edit introduces.

    Creates ``image_gen``, ``glb_gen`` and ``debug`` under ``output_dir``,
    generates one asset per requested object and normalizes every result
    against ``output_root``.

    Returns:
        Dict with ``status`` (``"ok"`` when every result reports ``"ok"``,
        including the case of no results; otherwise ``"partial"``),
        ``object_count`` and the normalized ``generated_assets`` list.
    """
    image_gen_dir = output_dir / "image_gen"
    glb_gen_dir = output_dir / "glb_gen"
    debug_dir = output_dir / "debug"
    for work_dir in (image_gen_dir, glb_gen_dir, debug_dir):
        work_dir.mkdir(parents=True, exist_ok=True)

    object_specs = [_scene_edit_object_spec(item) for item in generated_objects]
    log_info(
        "scene_edit object asset generation started "
        f"count={len(object_specs)} output_dir={output_dir}"
    )
    raw_results = generate_text_object_assets(
        object_specs=object_specs,
        image_gen_dir=image_gen_dir,
        glb_gen_dir=glb_gen_dir,
        debug_dir=debug_dir,
    )
    normalized_results = [
        _normalize_generated_asset_result(raw, output_root=output_root)
        for raw in raw_results
    ]

    succeeded = len(
        [
            entry
            for entry in normalized_results
            if str(entry.get("status", "")).strip() == "ok"
        ]
    )
    # With no results both counts are zero, so the status is "ok" as well.
    all_succeeded = succeeded == len(normalized_results)
    log_info(
        "scene_edit object asset generation completed "
        f"succeeded={succeeded} failed={len(normalized_results) - succeeded}"
    )
    return {
        "status": "ok" if all_succeeded else "partial",
        "object_count": len(normalized_results),
        "generated_assets": normalized_results,
    }
+
+def build_scene_edit_layout(
+ *,
+ scene_state: dict[str, Any],
+ resolved_intent: dict[str, Any],
+ generated_assets: list[dict[str, Any]],
+ output_root: Path,
+ optimize_new_objects_only: bool = False,
+) -> dict[str, Any]:
+ """Build an edited 2D layout on top of the previous scene state."""
+ support_region = extract_scene_edit_support_region(scene_state)
+ old_footprints = extract_scene_object_footprints(scene_state)
+ old_objects_by_id = {
+ str(obj.get("id", "")).strip(): obj
+ for obj in scene_state.get("objects", []) or []
+ if isinstance(obj, dict) and str(obj.get("id", "")).strip()
+ }
+ deleted_ids = {
+ str(object_id).strip()
+ for object_id in resolved_intent.get("deleted_object_ids", []) or []
+ if str(object_id).strip()
+ }
+ operations = [
+ op for op in resolved_intent.get("operations", []) or [] if isinstance(op, dict)
+ ]
+ updated_relations = [
+ relation
+ for relation in resolved_intent.get("updated_relations", []) or []
+ if isinstance(relation, dict)
+ ]
+ updated_grids = {
+ str(object_id).strip(): str(grid).strip()
+ for object_id, grid in (resolved_intent.get("updated_grid_assignments") or {}).items()
+ if str(object_id).strip() and str(grid).strip()
+ }
+ generated_asset_by_id = {
+ str(asset.get("id", "")).strip(): asset
+ for asset in generated_assets
+ if isinstance(asset, dict)
+ and str(asset.get("id", "")).strip()
+ and str(asset.get("status", "")).strip() == "ok"
+ }
+
+ replacement_target_by_new_id: dict[str, str] = {}
+ placement_by_new_id: dict[str, dict[str, Any]] = {}
+ added_ids: list[str] = []
+ replaced_ids: list[str] = []
+ explicit_reposition_replace_ids: set[str] = set()
+ for operation in operations:
+ op_type = str(operation.get("type", "")).strip()
+ new_id = str(operation.get("new_object_temp_id", "")).strip()
+ if not new_id:
+ continue
+ placement = operation.get("placement")
+ if isinstance(placement, dict):
+ placement_by_new_id[new_id] = placement
+ if op_type == "replace":
+ target_id = str(operation.get("target_object_id", "")).strip()
+ if target_id:
+ replacement_target_by_new_id[new_id] = target_id
+ replaced_ids.append(new_id)
+ placement_type = (
+ str(placement.get("type", "")).strip()
+ if isinstance(placement, dict)
+ else ""
+ )
+ if placement_type not in {"", "preserve_target"}:
+ explicit_reposition_replace_ids.add(new_id)
+ elif op_type == "add":
+ added_ids.append(new_id)
+
+ final_items: dict[str, dict[str, Any]] = {}
+ for object_id, obj in old_objects_by_id.items():
+ if object_id in deleted_ids:
+ continue
+ footprint = old_footprints.get(object_id)
+ if footprint is None:
+ continue
+ final_items[object_id] = {
+ "id": object_id,
+ "name": str(obj.get("name", "")).strip(),
+ "description": str(obj.get("description", "")).strip(),
+ "action": "keep",
+ "center_xy": list(footprint["center_xy"]),
+ "size_xy": list(footprint["size_xy"]),
+ "footprint_2d": footprint,
+ "source": "previous_scene",
+ }
+
+ generated_ids = sorted(generated_asset_by_id)
+ if not generated_ids:
+ return {
+ "status": "ok",
+ "support_region": support_region,
+ "deleted_object_ids": sorted(deleted_ids),
+ "layout_updates": sorted(final_items.values(), key=lambda item: item["id"]),
+ "optimization": {
+ "method": "reuse_previous_scene",
+ "generated_object_count": 0,
+ },
+ }
+
+ xy_sizes = {
+ object_id: np.asarray(
+ LayoutManager.compute_simready_glb_xy_size(
+ glb_path=_resolve_generated_asset_path(
+ generated_asset_by_id[object_id],
+ output_root=output_root,
+ ),
+ metric_scale=generated_asset_by_id[object_id].get("metric_scale"),
+ ),
+ dtype=np.float64,
+ )
+ for object_id in generated_ids
+ }
+ fixed_ids = set(replaced_ids)
+
+ for object_id in replaced_ids:
+ if object_id not in generated_asset_by_id:
+ continue
+ target_id = replacement_target_by_new_id.get(object_id, "")
+ target_footprint = old_footprints.get(target_id)
+ if target_footprint is None:
+ continue
+ asset = generated_asset_by_id[object_id]
+ center_xy = LayoutManager.clamp_center_to_support_region(
+ center_xy=list(target_footprint["center_xy"]),
+ size_xy=xy_sizes[object_id].tolist(),
+ support_region=support_region,
+ )
+ final_items[object_id] = {
+ "id": object_id,
+ "name": str(asset.get("name", "")).strip(),
+ "description": str(asset.get("description", "")).strip(),
+ "action": "replace",
+ "replaces": target_id,
+ "center_xy": center_xy,
+ "size_xy": xy_sizes[object_id].tolist(),
+ "footprint_2d": LayoutManager.build_xy_footprint(
+ center_xy=center_xy,
+ size_xy=xy_sizes[object_id].tolist(),
+ ),
+ "source": "generated_asset",
+ "simready_geometry_path": asset.get("simready_geometry_path")
+ or asset.get("mesh_path"),
+ }
+
+ initialized_added_centers = _initialize_added_object_centers(
+ added_ids=[object_id for object_id in added_ids if object_id in generated_asset_by_id],
+ placement_by_new_id=placement_by_new_id,
+ updated_grids=updated_grids,
+ updated_relations=updated_relations,
+ stable_items=final_items,
+ support_region=support_region,
+ xy_sizes=xy_sizes,
+ )
+ for object_id in added_ids:
+ if object_id not in generated_asset_by_id or object_id not in initialized_added_centers:
+ continue
+ asset = generated_asset_by_id[object_id]
+ center_xy = initialized_added_centers[object_id].tolist()
+ size_xy = xy_sizes[object_id].tolist()
+ final_items[object_id] = {
+ "id": object_id,
+ "name": str(asset.get("name", "")).strip(),
+ "description": str(asset.get("description", "")).strip(),
+ "action": "add",
+ "replaces": "",
+ "center_xy": center_xy,
+ "size_xy": size_xy,
+ "footprint_2d": LayoutManager.build_xy_footprint(center_xy=center_xy, size_xy=size_xy),
+ "source": "generated_asset",
+ "simready_geometry_path": asset.get("simready_geometry_path")
+ or asset.get("mesh_path"),
+ }
+
+ initial_centers_all = {
+ object_id: np.asarray(item["center_xy"], dtype=np.float64)
+ for object_id, item in final_items.items()
+ }
+ optimized_centers = {object_id: center.copy() for object_id, center in initial_centers_all.items()}
+ optimization_metadata: dict[str, Any] | None = None
+ all_object_ids = sorted(final_items)
+ if all_object_ids:
+ fixed_object_ids: list[str] = []
+ if optimize_new_objects_only:
+ movable_ids = set(added_ids) | explicit_reposition_replace_ids
+ fixed_object_ids = [
+ object_id
+ for object_id in all_object_ids
+ if object_id not in movable_ids
+ ]
+ gym_config = load_json_object(PipelinePaths(output_root).gym_config)
+ rigid_objects = gym_config.get("rigid_object")
+ if not isinstance(rigid_objects, list):
+ raise ValueError("gym_config rigid_object must be a list.")
+ rigid_by_id = {
+ str(item.get("uid", "")).strip(): item
+ for item in rigid_objects
+ if isinstance(item, dict) and str(item.get("uid", "")).strip()
+ }
+ sa_runtime_root = output_root / "scene_edit" / "sa_node3_5_runtime"
+ optimized_layout = LayoutManager.optimize_scene_edit_layout_with_sa_node3_5(
+ output_root=output_root,
+ support_region=support_region,
+ layout_items=final_items,
+ updated_relations=updated_relations,
+ updated_grids=updated_grids,
+ fixed_object_ids=fixed_object_ids,
+ rigid_by_id=rigid_by_id,
+ generated_asset_by_id=generated_asset_by_id,
+ runtime_root=sa_runtime_root,
+ )
+ all_optimized = {
+ object_id: np.asarray(center, dtype=np.float64)
+ for object_id, center in optimized_layout.get("centers", {}).items()
+ }
+ for object_id, center in all_optimized.items():
+ optimized_centers[object_id] = np.asarray(
+ LayoutManager.clamp_center_to_support_region(
+ center_xy=center.tolist(),
+ size_xy=final_items[object_id]["size_xy"],
+ support_region=support_region,
+ ),
+ dtype=np.float64,
+ )
+ optimization_metadata = optimized_layout.get("metadata")
+
+ for object_id, item in final_items.items():
+ center_xy = optimized_centers[object_id].tolist()
+ size_xy = item["size_xy"]
+ item["center_xy"] = center_xy
+ item["footprint_2d"] = LayoutManager.build_xy_footprint(center_xy=center_xy, size_xy=size_xy)
+
+ return {
+ "status": "ok",
+ "support_region": support_region,
+ "deleted_object_ids": sorted(deleted_ids),
+ "layout_updates": sorted(final_items.values(), key=lambda item: item["id"]),
+ "optimization": {
+ "method": "delete_then_replace_then_add_initialize_then_optimize",
+ "generated_object_count": len(generated_ids),
+ "fixed_replacement_count": len(fixed_ids),
+ "replaced_object_count": len(replaced_ids),
+ "added_object_count": len(initialized_added_centers),
+ "initialized_added_object_count": len(initialized_added_centers),
+ "optimized_object_count": len(all_object_ids),
+ "optimize_new_objects_only": optimize_new_objects_only,
+ "added_layout_optimization": optimization_metadata,
+ },
+ }
+
+
+def export_scene_edit_gym_state(
+ *,
+ output_root: Path,
+ scene_state: dict[str, Any],
+ generated_assets: list[dict[str, Any]],
+ layout_updates: list[dict[str, Any]],
+ output_dir: Path,
+) -> dict[str, Any]:
+ """Update gym_export files from scene-edit layout results.
+
+ Rebuilds the ``rigid_object`` list of gym_config from ``layout_updates``,
+ rewrites gym_config, renders a top-down plot, and writes a layout snapshot
+ plus the scene-state result JSON under ``<output_root>/gym_export/scene_state``.
+
+ Args:
+ output_root: Pipeline output root; gym_config is located through
+ ``PipelinePaths(output_root)``.
+ scene_state: Current scene state; its ``objects`` entry must be a list.
+ generated_assets: Generated asset records, looked up by their ``id``.
+ layout_updates: Layout items (``id``, ``action``, ``center_xy``,
+ ``size_xy``, ...) that drive the export.
+ output_dir: Not referenced anywhere in this function body.
+
+ Returns:
+ A dict with ``status``, ``updated_files``, ``object_count``,
+ ``gym_config_path`` and ``scene_state_path``.
+
+ Raises:
+ FileNotFoundError: If the gym_config file does not exist.
+ ValueError: If gym_config ``rigid_object`` or scene_state ``objects`` is
+ not a list, or a non-"keep" layout item has no generated asset.
+ """
+ paths = PipelinePaths(output_root)
+ gym_config_path = paths.gym_config
+ if not gym_config_path.is_file():
+ raise FileNotFoundError(f"gym_config.json not found: {gym_config_path}")
+ gym_config = load_json_object(gym_config_path)
+ rigid_objects = gym_config.get("rigid_object")
+ if not isinstance(rigid_objects, list):
+ raise ValueError("gym_config rigid_object must be a list.")
+
+ scene_objects = scene_state.get("objects")
+ if not isinstance(scene_objects, list):
+ raise ValueError("scene_state objects must be a list.")
+
+ # Index every input collection by its stripped id; entries that are not
+ # dicts or whose id is empty after stripping are dropped. Rigid objects are
+ # keyed by "uid", everything else by "id".
+ rigid_by_id = {
+ str(item.get("uid", "")).strip(): item
+ for item in rigid_objects
+ if isinstance(item, dict) and str(item.get("uid", "")).strip()
+ }
+ scene_by_id = {
+ str(item.get("id", "")).strip(): item
+ for item in scene_objects
+ if isinstance(item, dict) and str(item.get("id", "")).strip()
+ }
+ generated_asset_by_id = {
+ str(item.get("id", "")).strip(): item
+ for item in generated_assets
+ if isinstance(item, dict) and str(item.get("id", "")).strip()
+ }
+ layout_by_id = {
+ str(item.get("id", "")).strip(): item
+ for item in layout_updates
+ if isinstance(item, dict) and str(item.get("id", "")).strip()
+ }
+
+ scene_state_dir = output_root / "gym_export" / "scene_state"
+ mesh_assets_dir = output_root / "gym_export" / "mesh_assets"
+ scene_state_dir.mkdir(parents=True, exist_ok=True)
+ mesh_assets_dir.mkdir(parents=True, exist_ok=True)
+
+ # NOTE(review): gym_config is read from disk a second time here; the path is
+ # the same one loaded above, so this reload looks redundant.
+ gym_config = load_json_object(PipelinePaths(output_root).gym_config)
+ table_surface_height = _infer_scene_edit_table_surface_height(
+ output_root=output_root,
+ gym_config=gym_config,
+ )
+
+ updated_rigid_objects: list[dict[str, Any]] = []
+ updated_scene_objects: list[dict[str, Any]] = []
+ updated_files: list[str] = []
+
+ for object_id, layout_item in layout_by_id.items():
+ action = str(layout_item.get("action", "")).strip()
+ center_xy = layout_item.get("center_xy")
+ size_xy = layout_item.get("size_xy")
+ # Layout items without a two-element center_xy and size_xy list are skipped.
+ if not (
+ isinstance(center_xy, list)
+ and len(center_xy) == 2
+ and isinstance(size_xy, list)
+ and len(size_xy) == 2
+ ):
+ continue
+ old_rigid = rigid_by_id.get(object_id)
+ old_scene_obj = scene_by_id.get(object_id)
+ # A "keep" item with no rigid object in gym_config is skipped.
+ if action == "keep" and old_rigid is None:
+ continue
+
+ if action == "keep":
+ updated_rigid = _update_existing_rigid_object(
+ object_id=object_id,
+ rigid_object=old_rigid,
+ old_scene_object=old_scene_obj,
+ layout_item=layout_item,
+ )
+ else:
+ # Every action other than "keep" requires a generated asset.
+ generated_asset = generated_asset_by_id.get(object_id)
+ if generated_asset is None:
+ raise ValueError(f"Missing generated asset for edited object: {object_id}")
+ # The 0.01 added to the inferred table surface height is a fixed offset.
+ updated_rigid = _build_generated_rigid_object(
+ object_id=object_id,
+ layout_item=layout_item,
+ generated_asset=generated_asset,
+ output_root=output_root,
+ mesh_assets_dir=mesh_assets_dir,
+ table_height=table_surface_height + 0.01,
+ )
+ # Record the mesh path referenced by the rigid object's shape.
+ # NOTE(review): a generated object's fpath is relative to gym_export (see
+ # _build_generated_rigid_object), while the other updated_files entries go
+ # through relative_path(..., output_root); a missing fpath adds "".
+ shape = updated_rigid.get("shape")
+ if isinstance(shape, dict):
+ updated_files.append(str(shape.get("fpath", "")))
+
+ updated_rigid_objects.append(updated_rigid)
+ updated_scene_objects.append(
+ _build_scene_state_object(
+ object_id=object_id,
+ layout_item=layout_item,
+ rigid_object=updated_rigid,
+ output_root=output_root,
+ )
+ )
+
+ # The rigid_object list is replaced wholesale by the entries collected above.
+ gym_config["rigid_object"] = updated_rigid_objects
+ write_json(gym_config_path, gym_config)
+ updated_files.append(relative_path(gym_config_path, output_root))
+
+ topdown_path = scene_state_dir / "topdown_2d.png"
+ _render_scene_state_topdown(
+ support_region=extract_scene_edit_support_region(scene_state),
+ objects=updated_scene_objects,
+ output_path=topdown_path,
+ )
+ updated_files.append(relative_path(topdown_path, output_root))
+
+ # Paths stored inside the scene-state payload are relative to gym_export.
+ state_payload = dict(scene_state)
+ state_payload["gym_config_path"] = str(gym_config_path.relative_to(output_root / "gym_export"))
+ state_payload["topdown_2d_plot_path"] = str(topdown_path.relative_to(output_root / "gym_export"))
+ state_payload["objects"] = updated_scene_objects
+ source_snapshots = dict(scene_state.get("source_snapshots") or {})
+ layout_snapshot_path = scene_state_dir / "scene_edit_layout.json"
+ write_json(
+ layout_snapshot_path,
+ {"layout_updates": layout_updates},
+ )
+ source_snapshots["scene_edit_layout"] = str(
+ layout_snapshot_path.relative_to(output_root / "gym_export")
+ )
+ state_payload["source_snapshots"] = source_snapshots
+ scene_state_result_path = scene_state_dir / "result.json"
+ write_json(scene_state_result_path, state_payload)
+ updated_files.append(relative_path(scene_state_result_path, output_root))
+ updated_files.append(relative_path(layout_snapshot_path, output_root))
+
+ # updated_files is de-duplicated and sorted before being returned.
+ return {
+ "status": "ok",
+ "updated_files": sorted(set(updated_files)),
+ "object_count": len(updated_scene_objects),
+ "gym_config_path": str(gym_config_path),
+ "scene_state_path": str(scene_state_result_path),
+ }
+
+
+def validate_scene_edit_intent(
+    *,
+    intent: dict[str, Any],
+    scene_objects: list[dict[str, str]],
+) -> None:
+    """Validate that an edit intent only references legal object ids.
+
+    Checks, in order: every entry of ``deleted_object_ids`` is an existing
+    scene id; every ``generated_objects`` entry has a non-empty ``temp_id``
+    that neither collides with an existing id nor repeats; every operation
+    targets known ids; and every placement ``reference_object_id`` names an
+    existing scene object.
+
+    Args:
+        intent: Parsed edit intent providing the ``deleted_object_ids``,
+            ``generated_objects`` and ``operations`` lists.
+        scene_objects: Current scene objects; only their ``id`` is read.
+
+    Raises:
+        ValueError: On the first rule that is violated.
+    """
+    # Scene ids are stripped exactly like the intent ids they are compared
+    # with below; otherwise whitespace padding on a scene id would make a
+    # perfectly valid target look unknown.
+    existing_ids = {str(obj["id"]).strip() for obj in scene_objects if obj.get("id")}
+    existing_ids.discard("")
+    deleted_ids = _string_set(intent.get("deleted_object_ids"), "deleted_object_ids")
+    unknown_deleted = sorted(deleted_ids - existing_ids)
+    if unknown_deleted:
+        raise ValueError(
+            "Scene edit intent deleted unknown object ids: "
+            f"{unknown_deleted}"
+        )
+
+    generated_objects = intent.get("generated_objects")
+    if not isinstance(generated_objects, list):
+        raise ValueError("Scene edit intent generated_objects must be a list.")
+    generated_ids: set[str] = set()
+    for generated in generated_objects:
+        if not isinstance(generated, dict):
+            raise ValueError("Scene edit intent generated_objects entries must be objects.")
+        temp_id = str(generated.get("temp_id", "")).strip()
+        if not temp_id:
+            raise ValueError("Scene edit intent generated object has empty temp_id.")
+        if temp_id in existing_ids:
+            raise ValueError(
+                f"Scene edit generated temp_id collides with existing id: {temp_id}"
+            )
+        if temp_id in generated_ids:
+            raise ValueError(
+                f"Scene edit generated temp_id is duplicated: {temp_id}"
+            )
+        generated_ids.add(temp_id)
+
+    operations = intent.get("operations")
+    if not isinstance(operations, list):
+        raise ValueError("Scene edit intent operations must be a list.")
+    for operation in operations:
+        if not isinstance(operation, dict):
+            raise ValueError("Scene edit intent operation entries must be objects.")
+        op_type = str(operation.get("type", "")).strip()
+        target_id = str(operation.get("target_object_id", "")).strip()
+        new_temp_id = str(operation.get("new_object_temp_id", "")).strip()
+        if op_type in {"delete", "replace"} and target_id not in existing_ids:
+            raise ValueError(
+                f"Scene edit {op_type} operation targets unknown object id: "
+                f"{target_id}"
+            )
+        if op_type == "delete" and target_id not in deleted_ids:
+            raise ValueError(
+                f"Scene edit delete target is missing from deleted_object_ids: "
+                f"{target_id}"
+            )
+        if op_type == "replace":
+            # A replace removes its target and introduces a generated object,
+            # so both sides of the swap must be declared in the intent.
+            if target_id not in deleted_ids:
+                raise ValueError(
+                    "Scene edit replace target is missing from deleted_object_ids: "
+                    f"{target_id}"
+                )
+            if new_temp_id not in generated_ids:
+                raise ValueError(
+                    "Scene edit replace operation references unknown generated "
+                    f"temp_id: {new_temp_id}"
+                )
+        if op_type == "add" and new_temp_id not in generated_ids:
+            raise ValueError(
+                f"Scene edit add operation references unknown generated temp_id: {new_temp_id}"
+            )
+        placement = operation.get("placement")
+        if isinstance(placement, dict):
+            # Only a non-empty reference is checked, and it must be an
+            # existing scene id (generated temp ids are not accepted here).
+            reference_id = str(placement.get("reference_object_id", "")).strip()
+            if reference_id and reference_id not in existing_ids:
+                raise ValueError(
+                    "Scene edit placement references unknown object id: "
+                    f"{reference_id}"
+                )
+
+
+def _scene_edit_object_spec(generated_object: dict[str, Any]) -> dict[str, Any]:
+    """Build the object spec dict for one generated object of an edit intent.
+
+    ``temp_id``, ``name`` and ``description`` are read from *generated_object*,
+    converted with ``str`` and stripped. ``class_candidate`` holds the name as
+    its single entry, or stays empty when the name is blank.
+    """
+
+    def _text(key: str) -> str:
+        return str(generated_object.get(key, "")).strip()
+
+    object_name = _text("name")
+    spec: dict[str, Any] = {
+        "id": _text("temp_id"),
+        "name": object_name,
+        "description": _text("description"),
+        "class_candidate": [],
+    }
+    if object_name:
+        spec["class_candidate"].append(object_name)
+    return spec
+
+
+def _normalize_generated_asset_result(
+    result: dict[str, Any],
+    *,
+    output_root: Path,
+) -> dict[str, Any]:
+    """Return a shallow copy of *result* with its path fields made relative.
+
+    The ``image_path``, ``raw_geometry_path``, ``mesh_path`` and
+    ``simready_geometry_path`` entries are passed through
+    ``relative_path(value, output_root)`` when they hold a truthy value; all
+    other entries are copied unchanged. The input dict is not modified.
+    """
+    path_keys = (
+        "image_path",
+        "raw_geometry_path",
+        "mesh_path",
+        "simready_geometry_path",
+    )
+    rewritten = {
+        key: relative_path(result[key], output_root)
+        for key in path_keys
+        if result.get(key)
+    }
+    # Overlaying onto a copy keeps every key at its original position.
+    return {**result, **rewritten}
+
+
+def _map_relation_endpoint(
+    *,
+    object_id: str,
+    deleted_ids: set[str],
+    replacement_map: dict[str, str],
+    replacement_inherits: set[str],
+) -> str | None:
+    """Map one relation endpoint through the delete/replace bookkeeping.
+
+    Returns *object_id* unchanged when it was not deleted. For a deleted id,
+    returns its replacement only if a non-empty replacement exists and the id
+    is listed in *replacement_inherits*; otherwise returns ``None``.
+    """
+    if object_id not in deleted_ids:
+        return object_id
+    if object_id not in replacement_inherits:
+        return None
+    return replacement_map.get(object_id) or None
+
+
+def _normalize_generated_objects(
+    *,
+    operations: list[dict[str, Any]],
+    generated_objects: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Return copies of the generated objects with ``source_operation`` filled in.
+
+    An entry keeps its own stripped ``source_operation`` when that is
+    non-empty. Otherwise the type of the add/replace operation that references
+    its ``temp_id`` is used, falling back to ``"add"``. Entries whose
+    ``temp_id`` is blank are left out.
+    """
+    creating_ops = {"add", "replace"}
+    op_type_for: dict[str, str] = {}
+    for op in operations:
+        referenced_id = str(op.get("new_object_temp_id", "")).strip()
+        kind = str(op.get("type", "")).strip()
+        if referenced_id and kind in creating_ops:
+            # A later operation overrides an earlier one for the same id.
+            op_type_for[referenced_id] = kind
+
+    result: list[dict[str, Any]] = []
+    for entry in generated_objects:
+        entry_id = str(entry.get("temp_id", "")).strip()
+        if not entry_id:
+            continue
+        declared = str(entry.get("source_operation", "")).strip()
+        record = dict(entry)
+        record["source_operation"] = declared or op_type_for.get(entry_id, "add")
+        result.append(record)
+    return result
+
+
+def _normalize_scene_edit_intent_ids(
+ *,
+ intent: dict[str, Any],
+ scene_objects: list[dict[str, str]],
+) -> dict[str, Any]:
+ """Make generated temp ids internally consistent and unique for this scene.
+
+ Works on a JSON round-trip copy of ``intent`` and returns that copy; the
+ argument itself is not modified. Two passes are applied to the copy:
+
+ 1. An add/replace operation whose ``new_object_temp_id`` is not among the
+ generated temp ids is re-pointed when the choice is unambiguous: at the
+ only generated id no operation references, or else at the only generated
+ object.
+ 2. A generated ``temp_id`` that clashes with a scene id or with an earlier
+ generated id gets a fresh id from ``_unique_scene_edit_generated_id``,
+ and operations referencing the old id are rewritten to the new one.
+ """
+ # JSON round-trip is used as a deep copy (so intent must be JSON-serializable).
+ normalized = json.loads(json.dumps(intent))
+ existing_ids = {
+ str(obj.get("id", "")).strip()
+ for obj in scene_objects
+ if str(obj.get("id", "")).strip()
+ }
+ # These lists hold references to the dicts inside ``normalized``, so the
+ # in-place edits below are reflected in the returned copy.
+ generated_objects = [
+ obj
+ for obj in normalized.get("generated_objects", [])
+ if isinstance(obj, dict)
+ ]
+ operations = [
+ op
+ for op in normalized.get("operations", [])
+ if isinstance(op, dict)
+ ]
+
+ generated_ids = {
+ str(obj.get("temp_id", "")).strip()
+ for obj in generated_objects
+ if str(obj.get("temp_id", "")).strip()
+ }
+ # Temp ids that some add/replace operation points at.
+ referenced_ids = {
+ str(op.get("new_object_temp_id", "")).strip()
+ for op in operations
+ if str(op.get("type", "")).strip() in {"add", "replace"}
+ and str(op.get("new_object_temp_id", "")).strip()
+ }
+
+ # Generated temp ids that no add/replace operation references.
+ unused_generated_ids = [
+ str(obj.get("temp_id", "")).strip()
+ for obj in generated_objects
+ if str(obj.get("temp_id", "")).strip()
+ and str(obj.get("temp_id", "")).strip() not in referenced_ids
+ ]
+ # Pass 1: repair operations whose temp id matches no generated object.
+ for operation in operations:
+ op_type = str(operation.get("type", "")).strip()
+ if op_type not in {"add", "replace"}:
+ continue
+ new_temp_id = str(operation.get("new_object_temp_id", "")).strip()
+ if not new_temp_id or new_temp_id in generated_ids:
+ continue
+ # NOTE(review): unused_generated_ids is not updated inside this loop, so
+ # several dangling operations can all be pointed at the same id.
+ if len(unused_generated_ids) == 1:
+ operation["new_object_temp_id"] = unused_generated_ids[0]
+ elif len(generated_objects) == 1:
+ operation["new_object_temp_id"] = str(
+ generated_objects[0].get("temp_id", "")
+ ).strip()
+
+ # Pass 2: rename generated ids that collide with scene ids or each other.
+ reserved = set(existing_ids)
+ seen_generated: set[str] = set()
+ rename_by_old_id: dict[str, str] = {}
+ for generated in generated_objects:
+ old_id = str(generated.get("temp_id", "")).strip()
+ if not old_id:
+ continue
+ new_id = old_id
+ if new_id in reserved or new_id in seen_generated:
+ new_id = _unique_scene_edit_generated_id(
+ base_id=old_id,
+ reserved_ids=reserved | seen_generated,
+ )
+ generated["temp_id"] = new_id
+ seen_generated.add(new_id)
+ reserved.add(new_id)
+ if new_id != old_id:
+ rename_by_old_id[old_id] = new_id
+
+ # Operations that still reference a renamed id follow the rename.
+ if rename_by_old_id:
+ for operation in operations:
+ new_temp_id = str(operation.get("new_object_temp_id", "")).strip()
+ if new_temp_id in rename_by_old_id:
+ operation["new_object_temp_id"] = rename_by_old_id[new_temp_id]
+
+ return normalized
+
+
+def _unique_scene_edit_generated_id(
+    *,
+    base_id: str,
+    reserved_ids: set[str],
+) -> str:
+    """Return the first ``<stem>_<n>`` (n = 0, 1, 2, ...) not in *reserved_ids*.
+
+    The stem is *base_id* stripped of surrounding whitespace and of one
+    trailing ``_<digits>`` suffix; an empty stem becomes ``"new_object"``.
+    """
+    stem = re.sub(r"_\d+$", "", base_id.strip())
+    if not stem:
+        stem = "new_object"
+    suffix = 0
+    candidate = f"{stem}_{suffix}"
+    while candidate in reserved_ids:
+        suffix += 1
+        candidate = f"{stem}_{suffix}"
+    return candidate
+
+
+def _map_reference_endpoint(
+    *,
+    object_id: str,
+    deleted_ids: set[str],
+    replacement_map: dict[str, str],
+) -> str | None:
+    """Resolve a reference id after deletions and replacements.
+
+    A replaced id maps to its replacement, a deleted id without replacement
+    maps to ``None``, and any other id is returned unchanged.
+    """
+    try:
+        return replacement_map[object_id]
+    except KeyError:
+        pass
+    return None if object_id in deleted_ids else object_id
+
+
+def _placement_relation_to_canonical(
+    *,
+    new_object_id: str,
+    relation: str,
+    reference_object_id: str,
+) -> dict[str, str] | None:
+    """Express a placement relation using only ``left_of`` / ``front_of``.
+
+    ``left_of`` and ``front_of`` keep the new object as subject. ``right_of``
+    and ``back_of`` are rewritten as the mirrored relation with the reference
+    object as subject. ``behind`` is accepted as an alias of ``back_of``,
+    matching ``_offset_center_by_relation``.
+
+    Returns:
+        A ``{"subject", "relation", "object"}`` dict, or ``None`` when either
+        id is empty or the relation is not one of the supported names.
+    """
+    if not new_object_id or not reference_object_id:
+        return None
+    # relation name -> (canonical relation, whether the reference is the subject)
+    canonical_forms = {
+        "left_of": ("left_of", False),
+        "right_of": ("left_of", True),
+        "front_of": ("front_of", False),
+        "back_of": ("front_of", True),
+        "behind": ("front_of", True),
+    }
+    form = canonical_forms.get(relation)
+    if form is None:
+        return None
+    canonical_relation, reference_is_subject = form
+    if reference_is_subject:
+        subject, target = reference_object_id, new_object_id
+    else:
+        subject, target = new_object_id, reference_object_id
+    return {
+        "subject": subject,
+        "relation": canonical_relation,
+        "object": target,
+    }
+
+
+def _close_relations_with_sources(
+    direct_relations: list[dict[str, str]],
+) -> list[dict[str, str]]:
+    """Return the transitive closure of the relations, each tagged with a source.
+
+    Edges that match a direct relation keep that relation's non-empty
+    ``source``; every other edge returned by ``transitive_relation_closure``
+    is tagged ``"transitive_closure"``. An empty input yields an empty list.
+    """
+    if not direct_relations:
+        return []
+
+    known_sources: dict[tuple[str, str, str], str] = {}
+    for entry in direct_relations:
+        edge = (
+            str(entry.get("subject", "")).strip(),
+            str(entry.get("relation", "")).strip(),
+            str(entry.get("object", "")).strip(),
+        )
+        # A later duplicate of the same edge overrides the earlier source.
+        known_sources[edge] = str(entry.get("source", "")).strip()
+
+    tagged: list[dict[str, str]] = []
+    for item in transitive_relation_closure(direct_relations):
+        subject, kind, target = item["subject"], item["relation"], item["object"]
+        origin = known_sources.get((subject, kind, target)) or "transitive_closure"
+        tagged.append(
+            {
+                "subject": subject,
+                "relation": kind,
+                "object": target,
+                "source": origin,
+            }
+        )
+    return tagged
+
+
+def _string_set(value: Any, context: str) -> set[str]:
+    """Convert a list of ids into a set of stripped strings.
+
+    Raises:
+        ValueError: If *value* is not a list, or any entry is empty after
+            ``str(...).strip()``. *context* names the intent field in the
+            error message.
+    """
+    if not isinstance(value, list):
+        raise ValueError(f"Scene edit intent {context} must be a list.")
+    cleaned = [str(entry).strip() for entry in value]
+    if not all(cleaned):
+        raise ValueError(f"Scene edit intent {context} contains an empty id.")
+    return set(cleaned)
+
+
+def _resolve_generated_asset_path(asset: dict[str, Any], *, output_root: Path) -> Path:
+    """Return the resolved path of a generated asset's geometry file.
+
+    ``simready_geometry_path`` is preferred over ``mesh_path``. After ``~``
+    expansion, a relative path is taken relative to *output_root*. When
+    neither key holds a value the result is *output_root* itself (resolved).
+    """
+    raw_value = asset.get("simready_geometry_path") or asset.get("mesh_path") or ""
+    candidate = Path(str(raw_value)).expanduser()
+    if not candidate.is_absolute():
+        candidate = output_root / candidate
+    return candidate.resolve()
+
+
+def _infer_scene_edit_table_surface_height(
+ *,
+ output_root: Path,
+ gym_config: dict[str, Any],
+) -> float:
+ """Estimate the table-top height from the first gym_config background mesh.
+
+ Loads the mesh referenced by ``background[0]["shape"]["fpath"]`` (resolved
+ under ``<output_root>/gym_export``), applies that entry's ``body_scale`` and
+ ``init_rot``, and returns ``init_pos[2]`` plus the maximum of the
+ transformed vertices along mesh axis 1.
+
+ Returns 0.0 whenever the height cannot be determined: trimesh cannot be
+ imported, the background entry / shape / fpath is missing or malformed,
+ the mesh file does not exist, or the loaded mesh has no vertices.
+ """
+ # trimesh is imported lazily; without it the function degrades to 0.0.
+ try:
+ import trimesh
+ import trimesh.transformations as tt
+ except ImportError:
+ return 0.0
+
+ # Only the first background entry is inspected and treated as the table.
+ background = gym_config.get("background")
+ if not isinstance(background, list) or not background:
+ return 0.0
+ table = background[0]
+ if not isinstance(table, dict):
+ return 0.0
+
+ shape = table.get("shape")
+ if not isinstance(shape, dict):
+ return 0.0
+ fpath = str(shape.get("fpath", "") or "").strip()
+ if not fpath:
+ return 0.0
+ table_mesh_path = (output_root / "gym_export" / fpath).resolve()
+ if not table_mesh_path.is_file():
+ return 0.0
+
+ # Collapse whatever trimesh returns (mesh, scene, or a list of geometries)
+ # into a single mesh.
+ scene = trimesh.load(table_mesh_path, force="scene")
+ if isinstance(scene, trimesh.Trimesh):
+ mesh = scene
+ else:
+ dumped = scene.dump(concatenate=True)
+ if isinstance(dumped, trimesh.Trimesh):
+ mesh = dumped
+ else:
+ meshes = [item for item in dumped if isinstance(item, trimesh.Trimesh)]
+ if not meshes:
+ return 0.0
+ mesh = trimesh.util.concatenate(meshes)
+
+ verts = np.asarray(mesh.vertices, dtype=np.float64)
+ if verts.size == 0:
+ return 0.0
+
+ # A body_scale that is not three finite values is replaced by ones.
+ body_scale = np.asarray(table.get("body_scale") or [1.0, 1.0, 1.0], dtype=np.float64)
+ if body_scale.shape != (3,) or not np.all(np.isfinite(body_scale)):
+ body_scale = np.ones(3, dtype=np.float64)
+ # Components 1 and 2 of body_scale are swapped before scaling the vertices;
+ # presumably body_scale is given in a Z-up sim frame while the mesh is
+ # Y-up — TODO confirm.
+ glb_scale = np.asarray([body_scale[0], body_scale[2], body_scale[1]], dtype=np.float64)
+ verts = verts * glb_scale.reshape(1, 3)
+
+ # init_rot is read as degrees (np.deg2rad) and applied as "sxyz" Euler
+ # angles; an all-zero rotation is skipped.
+ init_rot = np.asarray(table.get("init_rot") or [0.0, 0.0, 0.0], dtype=np.float64)
+ if init_rot.shape == (3,) and np.any(np.abs(init_rot) > 1.0e-8):
+ rot = tt.euler_matrix(
+ float(np.deg2rad(init_rot[0])),
+ float(np.deg2rad(init_rot[1])),
+ float(np.deg2rad(init_rot[2])),
+ axes="sxyz",
+ )
+ verts = (rot[:3, :3] @ verts.T).T
+
+ init_pos = np.asarray(table.get("init_pos") or [0.0, 0.0, 0.0], dtype=np.float64)
+ if init_pos.shape != (3,) or not np.all(np.isfinite(init_pos)):
+ init_pos = np.zeros(3, dtype=np.float64)
+
+ # Mesh axis 1 is taken as the vertical axis and offset by init_pos[2];
+ # assumes a Y-up mesh placed in a Z-up scene — TODO confirm.
+ return float(init_pos[2] + np.max(verts[:, 1]))
+
+
+def _update_existing_rigid_object(
+    *,
+    object_id: str,
+    rigid_object: dict[str, Any] | None,
+    old_scene_object: dict[str, Any] | None,
+    layout_item: dict[str, Any],
+) -> dict[str, Any]:
+    """Return a copy of an existing rigid object moved to its new layout center.
+
+    The XY part of ``init_pos`` is shifted by the difference between the layout
+    item's ``center_xy`` and the old scene object's footprint center. When the
+    old center is unavailable or the new one is not a 2-vector, the position
+    is left where it was. ``description`` is taken from the layout item, or
+    kept from the rigid object when the layout item has none.
+
+    Raises:
+        ValueError: If *rigid_object* is ``None``.
+    """
+    if rigid_object is None:
+        raise ValueError(f"Missing rigid_object for existing scene object: {object_id}")
+
+    # JSON round-trip gives an independent copy, so the input is not mutated.
+    result = json.loads(json.dumps(rigid_object))
+    previous_center = _scene_edit_center_xy(old_scene_object)
+    target_center = np.asarray(layout_item.get("center_xy", []), dtype=np.float64)
+    position = list(result.get("init_pos") or [0.0, 0.0, 0.0])
+    can_shift = previous_center is not None and target_center.shape == (2,)
+    if can_shift:
+        shift = target_center - previous_center
+        for axis in (0, 1):
+            position[axis] = float(position[axis]) + float(shift[axis])
+    result["init_pos"] = [float(component) for component in position]
+
+    fresh_description = str(layout_item.get("description", "")).strip()
+    if not fresh_description:
+        fresh_description = str(result.get("description", "")).strip()
+    result["description"] = fresh_description
+    return result
+
+
+def _build_generated_rigid_object(
+    *,
+    object_id: str,
+    layout_item: dict[str, Any],
+    generated_asset: dict[str, Any],
+    output_root: Path,
+    mesh_assets_dir: Path,
+    table_height: float,
+) -> dict[str, Any]:
+    """Copy a generated GLB into the mesh assets and build its rigid-object entry.
+
+    The asset file is copied to ``<mesh_assets_dir>/<safe_name>/<object_id>/
+    <object_id>.glb``. The entry's ``fpath`` is that file relative to
+    ``<output_root>/gym_export``, and ``init_pos`` is the layout ``center_xy``
+    with *table_height* as third component.
+
+    Raises:
+        FileNotFoundError: If the resolved asset path is not a file.
+        ValueError: If the layout item has no two-element ``center_xy``.
+    """
+    source_glb = _resolve_generated_asset_path(generated_asset, output_root=output_root)
+    if not source_glb.is_file():
+        raise FileNotFoundError(f"Generated simready GLB not found: {source_glb}")
+
+    folder_name = object_id.replace("interact_", "").strip("_") or "object"
+    target_dir = mesh_assets_dir / folder_name / object_id
+    target_dir.mkdir(parents=True, exist_ok=True)
+    copied_glb = target_dir / f"{object_id}.glb"
+    shutil.copy2(source_glb, copied_glb)
+
+    # Uniform scale from metric_scale["scale_factor"]; anything unparsable,
+    # non-finite or non-positive falls back to 1.0.
+    uniform_scale = 1.0
+    scale_info = generated_asset.get("metric_scale")
+    if isinstance(scale_info, dict):
+        try:
+            uniform_scale = float(scale_info.get("scale_factor", 1.0))
+        except (TypeError, ValueError):
+            uniform_scale = 1.0
+    if not (np.isfinite(uniform_scale) and uniform_scale > 0.0):
+        uniform_scale = 1.0
+    sim_scale = _glb_scale_to_sim([uniform_scale] * 3)
+
+    center = np.asarray(layout_item.get("center_xy", []), dtype=np.float64)
+    if center.shape != (2,):
+        raise ValueError(f"Missing center_xy for generated object: {object_id}")
+
+    physics_attrs = {
+        "mass": 0.01,
+        "contact_offset": 0.003,
+        "rest_offset": 0.001,
+        "restitution": 0.01,
+        "max_depenetration_velocity": 10.0,
+        "min_position_iters": 32,
+        "min_velocity_iters": 8,
+    }
+    mesh_shape = {
+        "shape_type": "Mesh",
+        "fpath": str(copied_glb.relative_to(output_root / "gym_export")),
+        "compute_uv": False,
+    }
+    return {
+        "uid": object_id,
+        "description": str(layout_item.get("description", "")).strip(),
+        "shape": mesh_shape,
+        "attrs": physics_attrs,
+        "body_type": "dynamic",
+        "init_pos": [float(center[0]), float(center[1]), float(table_height)],
+        "init_rot": [0.0, 0.0, 0.0],
+        "body_scale": sim_scale,
+        "max_convex_hull_num": 16,
+    }
+
+
+def _build_scene_state_object(
+    *,
+    object_id: str,
+    layout_item: dict[str, Any],
+    rigid_object: dict[str, Any],
+    output_root: Path,
+) -> dict[str, Any]:
+    """Build the scene-state record for one object from its layout and rigid entry.
+
+    Pose fields (``init_pos``, ``init_rot``, ``body_scale``) come from
+    *rigid_object* as float lists, with zero/identity defaults when missing.
+    ``footprint_2d`` is the layout item's footprint when present, otherwise it
+    is built from ``center_xy`` / ``size_xy``. The role is always
+    ``"interact"``. *output_root* is not used by the body.
+    """
+
+    def _floats(key: str, fallback: list[float]) -> list[float]:
+        return [float(component) for component in rigid_object.get(key) or fallback]
+
+    rotation = _floats("init_rot", [0.0, 0.0, 0.0])
+    scale = _floats("body_scale", [1.0, 1.0, 1.0])
+    position = _floats("init_pos", [0.0, 0.0, 0.0])
+
+    footprint = layout_item.get("footprint_2d")
+    if not footprint:
+        footprint = LayoutManager.build_xy_footprint(
+            center_xy=list(layout_item.get("center_xy", [0.0, 0.0])),
+            size_xy=list(layout_item.get("size_xy", [0.0, 0.0])),
+        )
+
+    display_name = str(layout_item.get("name", "")).strip()
+    return {
+        "id": object_id,
+        "name": display_name if display_name else object_id,
+        "role": "interact",
+        "description": str(layout_item.get("description", "")).strip(),
+        "init_pos": position,
+        "init_rot": rotation,
+        "body_scale": scale,
+        "footprint_2d": footprint,
+    }
+
+
+def _scene_edit_center_xy(scene_object: dict[str, Any] | None) -> np.ndarray | None:
+    """Return ``footprint_2d.center_xy`` of a scene object as a float64 array.
+
+    Returns ``None`` when the object or its footprint is not a dict, or when
+    ``center_xy`` is not a list of exactly two entries.
+    """
+    footprint = scene_object.get("footprint_2d") if isinstance(scene_object, dict) else None
+    center = footprint.get("center_xy") if isinstance(footprint, dict) else None
+    if isinstance(center, list) and len(center) == 2:
+        return np.asarray(center, dtype=np.float64)
+    return None
+
+
+def _compute_anchor_targets(
+ *,
+ generated_ids: list[str],
+ replacement_target_by_new_id: dict[str, str],
+ placement_by_new_id: dict[str, dict[str, Any]],
+ updated_grids: dict[str, str],
+ old_footprints: dict[str, dict[str, Any]],
+ support_region: dict[str, Any],
+ xy_sizes: dict[str, np.ndarray],
+) -> dict[str, np.ndarray]:
+ """Derive a target XY center for generated objects from their placement hints.
+
+ Three sources are consulted for each generated id: the center of the
+ footprint it replaces, an offset from a ``relative_to_object`` reference
+ (computed by ``_offset_center_by_relation``), and the center of its assigned
+ grid cell. Ids that none of these resolves are absent from the result.
+
+ Returns:
+ Mapping from generated id to its target center.
+ """
+ targets: dict[str, np.ndarray] = {}
+ unresolved = set(generated_ids)
+ # Bounded number of passes (twice the number of generated ids, at least
+ # two): a reference may itself be a generated object whose target only
+ # becomes available after an earlier pass.
+ for _ in range(max(len(generated_ids), 1) * 2):
+ progressed = False
+ # Iterate over a snapshot because resolved ids are removed from the set.
+ for object_id in list(unresolved):
+ # Replacement: reuse the center of the replaced object's old footprint.
+ replacement_target = replacement_target_by_new_id.get(object_id)
+ if replacement_target:
+ target_footprint = old_footprints.get(replacement_target)
+ if target_footprint is None:
+ continue
+ targets[object_id] = np.asarray(
+ target_footprint["center_xy"],
+ dtype=np.float64,
+ )
+ unresolved.remove(object_id)
+ progressed = True
+ continue
+
+ placement = placement_by_new_id.get(object_id, {})
+ placement_type = str(placement.get("type", "")).strip()
+ if placement_type == "relative_to_object":
+ reference_id = str(placement.get("reference_object_id", "")).strip()
+ relation = str(placement.get("relation", "")).strip()
+ # Prefer a target already computed for the reference; otherwise
+ # fall back to the reference's old footprint.
+ reference_center = targets.get(reference_id)
+ reference_size = xy_sizes.get(reference_id)
+ if reference_center is None:
+ reference = old_footprints.get(reference_id)
+ if reference is not None:
+ reference_center = np.asarray(
+ reference["center_xy"],
+ dtype=np.float64,
+ )
+ reference_size = np.asarray(
+ reference["size_xy"],
+ dtype=np.float64,
+ )
+ if reference_center is not None and reference_size is not None:
+ targets[object_id] = _offset_center_by_relation(
+ reference_center=reference_center,
+ reference_size=reference_size,
+ object_size=xy_sizes[object_id],
+ relation=relation,
+ )
+ unresolved.remove(object_id)
+ progressed = True
+ continue
+
+ # Grid hint: use the center of the grid cell assigned to the object.
+ grid_name = updated_grids.get(object_id)
+ if grid_name:
+ targets[object_id] = LayoutManager.support_region_grid_center(
+ support_region=support_region,
+ grid_name=grid_name,
+ )
+ unresolved.remove(object_id)
+ progressed = True
+ continue
+ # The progressed flag is used to stop once nothing more can be resolved.
+ if not progressed:
+ break
+ return targets
+
+
+def _initialize_added_object_centers(
+    *,
+    added_ids: list[str],
+    placement_by_new_id: dict[str, dict[str, Any]],
+    updated_grids: dict[str, str],
+    updated_relations: list[dict[str, Any]],
+    stable_items: dict[str, dict[str, Any]],
+    support_region: dict[str, Any],
+    xy_sizes: dict[str, np.ndarray],
+) -> dict[str, np.ndarray]:
+    """Pick a starting XY center for every added object.
+
+    The seed is the center of the object's grid cell when one is assigned.
+    Without a grid cell, a ``relative_to_object`` placement whose reference is
+    in *stable_items* seeds the object at that reference's center. Otherwise
+    the support region's default center is used. Every seed is clamped to the
+    support region for the object's size. *updated_relations* is not used by
+    the body.
+    """
+    if not added_ids:
+        return {}
+
+    fallback_center = LayoutManager.support_region_default_center(support_region=support_region)
+
+    def _seed_for(new_id: str) -> np.ndarray:
+        grid = updated_grids.get(new_id)
+        if grid:
+            return LayoutManager.support_region_grid_center(
+                support_region=support_region,
+                grid_name=grid,
+            )
+        seed = fallback_center.copy()
+        hint = placement_by_new_id.get(new_id, {})
+        if str(hint.get("type", "")).strip() != "relative_to_object":
+            return seed
+        anchor = stable_items.get(str(hint.get("reference_object_id", "")).strip())
+        if anchor is None:
+            return seed
+        anchor_center = np.asarray(
+            anchor.get("center_xy", fallback_center.tolist()),
+            dtype=np.float64,
+        )
+        # A malformed reference center is ignored in favor of the default seed.
+        return anchor_center if anchor_center.shape == (2,) else seed
+
+    return {
+        new_id: np.asarray(
+            LayoutManager.clamp_center_to_support_region(
+                center_xy=_seed_for(new_id).tolist(),
+                size_xy=xy_sizes[new_id].tolist(),
+                support_region=support_region,
+            ),
+            dtype=np.float64,
+        )
+        for new_id in added_ids
+    }
+
+def _offset_center_by_relation(
+    *,
+    reference_center: np.ndarray,
+    reference_size: np.ndarray,
+    object_size: np.ndarray,
+    relation: str,
+    padding: float = 0.02,
+) -> np.ndarray:
+    """Return a center placed next to a reference object along one axis.
+
+    The distance is half the sum of both extents on that axis plus *padding*.
+    ``left_of`` / ``right_of`` move along -X / +X, ``front_of`` along -Y, and
+    ``back_of`` / ``behind`` along +Y. Any other relation is treated like
+    ``right_of``.
+    """
+    gaps = (
+        0.5 * (reference_size[0] + object_size[0]) + padding,
+        0.5 * (reference_size[1] + object_size[1]) + padding,
+    )
+    # relation -> (axis index, direction sign)
+    moves = {
+        "left_of": (0, -1.0),
+        "right_of": (0, 1.0),
+        "front_of": (1, -1.0),
+        "back_of": (1, 1.0),
+        "behind": (1, 1.0),
+    }
+    axis, sign = moves.get(relation, (0, 1.0))
+    shift = np.zeros(2, dtype=np.float64)
+    shift[axis] = sign * gaps[axis]
+    return reference_center + shift
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/__init__.py
new file mode 100644
index 000000000..ac8623089
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/__init__.py
@@ -0,0 +1,24 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.graph import (
+ build_scene_intake_graph,
+ run_scene_intake,
+)
+
+__all__ = ["build_scene_intake_graph", "run_scene_intake"]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/graph.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/graph.py
new file mode 100644
index 000000000..77874b15c
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/graph.py
@@ -0,0 +1,142 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+from langgraph.graph import END, StateGraph
+
+from embodichain.gen_sim.prompt2scene.llms import (
+ OpenAICompatibleLLMCfg,
+ build_chat_model,
+)
+from embodichain.gen_sim.prompt2scene.utils import log
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+ format_result_missing_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+ SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.nodes import (
+ call_vlm_scene_intake_node,
+ call_vlm_verify_scene_intake_node,
+ normalize_scene_intake_node,
+ normalize_verified_scene_intake_node,
+ prepare_input_node,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.state import (
+ SceneIntakeState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+
+__all__ = ["build_scene_intake_graph", "run_scene_intake"]
+
+
+def route_after_normalize(state: SceneIntakeState) -> str:
+    """Pick the edge to follow once the draft intake has been normalized.
+
+    Returns:
+        ``"verify"`` when a draft spec is present, ``"retry"`` when there is
+        no draft but attempts remain, and ``"end"`` otherwise.
+    """
+    has_draft = state["draft_scene_intake"] is not None
+    if has_draft:
+        return "verify"
+    attempts_left = state["attempt_count"] < state["max_attempts"]
+    return "retry" if attempts_left else "end"
+
+
+def route_after_verified_normalize(state: SceneIntakeState) -> str:
+    """Pick the edge to follow once the verifier output has been normalized.
+
+    Returns:
+        ``"retry"`` only when no final spec exists and attempts remain;
+        ``"end"`` in every other case.
+    """
+    if (
+        state["scene_intake"] is None
+        and state["attempt_count"] < state["max_attempts"]
+    ):
+        return "retry"
+    return "end"
+
+
+def build_scene_intake_graph(llm: Any) -> Any:
+    """Assemble and compile the scene intake LangGraph workflow.
+
+    The graph runs ``prepare_input`` -> extraction call -> draft
+    normalization, then either retries extraction, moves on to the verifier
+    call, or ends. After verifier normalization it either retries the
+    verifier call or ends.
+
+    Args:
+        llm: Chat model forwarded to both model-calling nodes.
+
+    Returns:
+        The compiled graph.
+    """
+    workflow = StateGraph(SceneIntakeState)
+
+    # Node registration order matches the execution order of the happy path.
+    node_table = {
+        "prepare_input": prepare_input_node,
+        "call_vlm_scene_intake": (
+            lambda state: call_vlm_scene_intake_node(state, llm=llm)
+        ),
+        "normalize_scene_intake": normalize_scene_intake_node,
+        "call_vlm_verify_scene_intake": (
+            lambda state: call_vlm_verify_scene_intake_node(state, llm=llm)
+        ),
+        "normalize_verified_scene_intake": normalize_verified_scene_intake_node,
+    }
+    for node_name, handler in node_table.items():
+        workflow.add_node(node_name, handler)
+
+    workflow.set_entry_point("prepare_input")
+    workflow.add_edge("prepare_input", "call_vlm_scene_intake")
+    workflow.add_edge("call_vlm_scene_intake", "normalize_scene_intake")
+
+    draft_routes = {
+        "retry": "call_vlm_scene_intake",
+        "verify": "call_vlm_verify_scene_intake",
+        "end": END,
+    }
+    workflow.add_conditional_edges(
+        "normalize_scene_intake",
+        route_after_normalize,
+        draft_routes,
+    )
+
+    workflow.add_edge(
+        "call_vlm_verify_scene_intake",
+        "normalize_verified_scene_intake",
+    )
+    verified_routes = {
+        "retry": "call_vlm_verify_scene_intake",
+        "end": END,
+    }
+    workflow.add_conditional_edges(
+        "normalize_verified_scene_intake",
+        route_after_verified_normalize,
+        verified_routes,
+    )
+    return workflow.compile()
+
+
+def run_scene_intake(
+    request: Prompt2SceneInput,
+    llm_cfg: OpenAICompatibleLLMCfg,
+) -> SceneIntakeSpec:
+    """Execute the scene intake workflow for one request.
+
+    Args:
+        request: The prompt2scene request to analyse.
+        llm_cfg: Model configuration; also supplies ``max_attempts``.
+
+    Returns:
+        The final ``scene_intake`` value produced by the workflow.
+
+    Raises:
+        RuntimeError: If the workflow finishes without a ``scene_intake``.
+    """
+    workflow = build_scene_intake_graph(build_chat_model(llm_cfg))
+    initial_state = {
+        "request": request,
+        "messages": [],
+        "raw_model_output": None,
+        "draft_scene_intake": None,
+        "scene_intake": None,
+        "attempt_count": 0,
+        "max_attempts": llm_cfg.max_attempts,
+        "last_error": None,
+        "errors": [],
+    }
+    final_state = workflow.invoke(initial_state)
+
+    scene_intake = final_state.get("scene_intake")
+    if scene_intake is None:
+        # No final spec: report everything that went wrong and fail loudly.
+        message = format_result_missing_error(
+            "Scene intake",
+            "SceneIntakeSpec",
+            attempt_count=final_state.get("attempt_count", 0),
+            last_error=final_state.get("last_error"),
+            errors=final_state.get("errors", []),
+        )
+        log.log_warning(message)
+        raise RuntimeError(message)
+    return scene_intake
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/nodes.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/nodes.py
new file mode 100644
index 000000000..5b579d78d
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/nodes.py
@@ -0,0 +1,213 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.prompts.schemas import (
+ SCENE_INTAKE_JSON_SCHEMA,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+ SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.utils import (
+ log_api_request_start,
+ log,
+)
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+ SCENE_INTAKE_STEP,
+ WorkflowArtifactWriter,
+)
+from embodichain.gen_sim.prompt2scene.llms.llm_output import (
+ StructuredModelCallError,
+ call_structured_json_model_step,
+)
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+ format_attempt_error,
+)
+from embodichain.gen_sim.prompt2scene.prompts.builders import (
+ build_scene_intake_messages,
+ build_scene_intake_verifier_messages,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.state import (
+ SceneIntakeState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.utils import (
+ build_scene_intake_spec,
+)
+
+__all__ = [
+ "call_vlm_scene_intake_node",
+ "call_vlm_verify_scene_intake_node",
+ "normalize_scene_intake_node",
+ "normalize_verified_scene_intake_node",
+ "prepare_input_node",
+]
+
+
+def prepare_input_node(state: SceneIntakeState) -> dict[str, object]:
+    """Build the chat messages for the extraction call from the request."""
+    chat_messages = build_scene_intake_messages(state["request"])
+    return {"messages": chat_messages}
+
+
+def call_vlm_scene_intake_node(
+    state: SceneIntakeState,
+    *,
+    llm: Any,
+) -> dict[str, object]:
+    """Run one extraction attempt against the configured model.
+
+    Args:
+        state: Current workflow state; ``messages`` is sent to the model.
+        llm: Chat model used for the structured JSON call.
+
+    Returns:
+        A state update with the incremented ``attempt_count`` and either the
+        raw model output or, when the call raised
+        ``StructuredModelCallError``, the formatted error appended to
+        ``errors``.
+    """
+    current_attempt = state["attempt_count"] + 1
+
+    try:
+        log_api_request_start(
+            step=SCENE_INTAKE_STEP,
+            request="extract",
+            attempt=current_attempt,
+        )
+        # NOTE(review): the writer is constructed but never used afterwards;
+        # kept because any constructor side effects are not visible here.
+        artifact_writer = WorkflowArtifactWriter(
+            state["request"].output_root,
+            SCENE_INTAKE_STEP,
+        )
+        model_json = call_structured_json_model_step(
+            llm=llm,
+            schema=SCENE_INTAKE_JSON_SCHEMA,
+            messages=state["messages"],
+            context="Scene intake",
+            attempt_count=current_attempt,
+        )
+    except StructuredModelCallError as exc:
+        failure = format_attempt_error("Scene intake", current_attempt, exc)
+        log.log_warning(failure)
+        return {
+            "attempt_count": current_attempt,
+            "raw_model_output": None,
+            "last_error": failure,
+            "errors": state["errors"] + [failure],
+        }
+    else:
+        return {
+            "attempt_count": current_attempt,
+            "raw_model_output": model_json,
+            "last_error": None,
+        }
+
+
+def normalize_scene_intake_node(state: SceneIntakeState) -> dict[str, object]:
+    """Turn the raw extraction JSON into a draft ``SceneIntakeSpec``.
+
+    Args:
+        state: Current workflow state; reads ``raw_model_output``,
+            ``request``, ``attempt_count`` and ``errors``.
+
+    Returns:
+        An empty update when there is no raw model output. When
+        normalization raises ``ValueError``, an update that clears the draft
+        and records the formatted error. Otherwise the draft spec, with the
+        final ``scene_intake`` reset to ``None``.
+    """
+    raw_model_output = state["raw_model_output"]
+    if raw_model_output is None:
+        return {}
+
+    try:
+        draft = build_scene_intake_spec(
+            request=state["request"],
+            model_output=raw_model_output,
+        )
+    except ValueError as exc:
+        error = format_attempt_error("Scene intake", state["attempt_count"], exc)
+        # Surface the failure right away, as the model-call nodes and the
+        # verifier normalization node already do.
+        log.log_warning(error)
+        return {
+            "draft_scene_intake": None,
+            "last_error": error,
+            "errors": state["errors"] + [error],
+        }
+
+    return {"draft_scene_intake": draft, "scene_intake": None}
+
+
+def call_vlm_verify_scene_intake_node(
+    state: SceneIntakeState,
+    *,
+    llm: Any,
+) -> dict[str, object]:
+    """Run one verifier attempt over the draft scene intake.
+
+    Args:
+        state: Current workflow state; ``draft_scene_intake`` is the spec
+            handed to the verifier prompt builder.
+        llm: Chat model used for the structured JSON call.
+
+    Returns:
+        An empty update when there is no draft. Otherwise an update with the
+        incremented ``attempt_count``, ``scene_intake`` reset to ``None`` and
+        either the raw verifier output or, when the call raised
+        ``StructuredModelCallError``, the formatted error appended to
+        ``errors``.
+    """
+    draft = state["draft_scene_intake"]
+    if draft is None:
+        return {}
+
+    current_attempt = state["attempt_count"] + 1
+    verifier_messages = build_scene_intake_verifier_messages(
+        request=state["request"],
+        scene_intake=draft,
+    )
+
+    # Shared shape of the update; the branches below fill in the rest.
+    update: dict[str, object] = {
+        "attempt_count": current_attempt,
+        "raw_model_output": None,
+        "scene_intake": None,
+        "last_error": None,
+    }
+
+    try:
+        log_api_request_start(
+            step=SCENE_INTAKE_STEP,
+            request="verify",
+            attempt=current_attempt,
+        )
+        # NOTE(review): the writer is constructed but never used afterwards;
+        # kept because any constructor side effects are not visible here.
+        artifact_writer = WorkflowArtifactWriter(
+            state["request"].output_root,
+            SCENE_INTAKE_STEP,
+        )
+        model_json = call_structured_json_model_step(
+            llm=llm,
+            schema=SCENE_INTAKE_JSON_SCHEMA,
+            messages=verifier_messages,
+            context="Scene intake verifier",
+            attempt_count=current_attempt,
+        )
+    except StructuredModelCallError as exc:
+        failure = format_attempt_error(
+            "Scene intake verifier", current_attempt, exc
+        )
+        log.log_warning(failure)
+        update["last_error"] = failure
+        update["errors"] = state["errors"] + [failure]
+        return update
+
+    update["raw_model_output"] = model_json
+    return update
+
+
+def normalize_verified_scene_intake_node(
+    state: SceneIntakeState,
+) -> dict[str, object]:
+    """Turn the raw verifier JSON into the final ``SceneIntakeSpec``.
+
+    Returns:
+        An empty update when there is no raw model output. When
+        normalization raises ``ValueError``, an update that clears
+        ``scene_intake``, logs and records the formatted error. Otherwise the
+        final spec with ``last_error`` cleared.
+    """
+    model_json = state["raw_model_output"]
+    if model_json is None:
+        return {}
+
+    try:
+        final_spec = build_scene_intake_spec(
+            request=state["request"],
+            model_output=model_json,
+        )
+    except ValueError as exc:
+        failure = format_attempt_error(
+            "Scene intake verifier", state["attempt_count"], exc
+        )
+        log.log_warning(failure)
+        return {
+            "scene_intake": None,
+            "last_error": failure,
+            "errors": state["errors"] + [failure],
+        }
+    else:
+        return {"scene_intake": final_spec, "last_error": None}
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/schema.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/schema.py
new file mode 100644
index 000000000..85cac107c
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/schema.py
@@ -0,0 +1,123 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.request import (
+ InputKind,
+ Prompt2SceneInput,
+)
+
+__all__ = [
+ "SceneIntakeAsset",
+ "SceneIntakeInputRecord",
+ "SceneIntakeSpec",
+ "SceneIntakeTable",
+]
+
+
+@dataclass(frozen=True)
+class SceneIntakeInputRecord:
+    """Immutable record of the input a scene intake run was built from."""
+
+    input_kind: InputKind
+    text: str | None = None
+    image_path: str | None = None
+
+    @classmethod
+    def from_request(cls, request: Prompt2SceneInput) -> "SceneIntakeInputRecord":
+        """Build a record from ``request``.
+
+        ``image_path`` is stringified when truthy and stored as ``None``
+        otherwise. ``text`` is always stored as ``None``.
+        """
+        # NOTE(review): text is never copied from the request -- confirm
+        # this is intentional.
+        kind = request.input_kind
+        source_image = request.image_path
+        return cls(
+            input_kind=kind,
+            text=None,
+            image_path=str(source_image) if source_image else None,
+        )
+
+    def to_manifest(self) -> dict[str, str | None]:
+        """Return the record as a plain dict, using the enum's ``value``."""
+        return dict(
+            input_kind=self.input_kind.value,
+            text=self.text,
+            image_path=self.image_path,
+        )
+
+
+@dataclass(frozen=True)
+class SceneIntakeTable:
+    """Table/support record produced by scene intake."""
+
+    id: str = "table"
+    name: str = "table"
+    description: str = ""
+    complete_table_description: str = ""
+    is_complete_visible_table: bool = False
+    class_candidate: list[str] = field(default_factory=list)
+    object_coverage_percent: int | None = None
+
+    def to_manifest(self) -> dict[str, object]:
+        """Return the table as a plain dict.
+
+        ``object_coverage_percent`` is only included when it is not ``None``;
+        ``class_candidate`` is copied so the manifest does not alias it.
+        """
+        manifest: dict[str, object] = dict(
+            id=self.id,
+            name=self.name,
+            description=self.description,
+            complete_table_description=self.complete_table_description,
+            is_complete_visible_table=self.is_complete_visible_table,
+            class_candidate=[*self.class_candidate],
+        )
+        coverage = self.object_coverage_percent
+        if coverage is None:
+            return manifest
+        manifest["object_coverage_percent"] = coverage
+        return manifest
+
+
+@dataclass(frozen=True)
+class SceneIntakeAsset:
+    """One object category group produced by scene intake."""
+
+    id: str
+    name: str
+    count: int = 1
+    description: str = ""
+    class_candidate: list[str] = field(default_factory=list)
+
+    def to_manifest(self) -> dict[str, object]:
+        """Return the asset as a plain dict with a copied candidate list."""
+        manifest: dict[str, object] = dict(
+            id=self.id,
+            name=self.name,
+            count=self.count,
+            description=self.description,
+            class_candidate=[*self.class_candidate],
+        )
+        return manifest
+
+
+@dataclass(frozen=True)
+class SceneIntakeSpec:
+    """Scene intake result: the input record, the table and the assets."""
+
+    input: SceneIntakeInputRecord
+    table: SceneIntakeTable
+    assets: list[SceneIntakeAsset]
+
+    def to_manifest(self) -> dict[str, object]:
+        """Return the spec as a plain dict built from each part's manifest."""
+        input_manifest = self.input.to_manifest()
+        table_manifest = self.table.to_manifest()
+        asset_manifests = []
+        for asset in self.assets:
+            asset_manifests.append(asset.to_manifest())
+        return {
+            "input": input_manifest,
+            "table": table_manifest,
+            "assets": asset_manifests,
+        }
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/state.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/state.py
new file mode 100644
index 000000000..7a96619fb
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/state.py
@@ -0,0 +1,37 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+ SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.attempt_state import AttemptState
+
+__all__ = ["SceneIntakeState"]
+
+
+class SceneIntakeState(AttemptState):
+ """LangGraph state for the fixed scene intake workflow.
+
+ Extends ``AttemptState``. The retry bookkeeping keys this workflow reads and
+ writes (``attempt_count``, ``max_attempts``, ``last_error``, ``errors``) are
+ presumably declared there -- TODO confirm against ``AttemptState``.
+ """
+
+ # Request forwarded to the message builders and to build_scene_intake_spec.
+ request: Prompt2SceneInput
+ # Chat messages for the extraction call; written by prepare_input_node.
+ messages: list[Any]
+ # JSON from the latest model call; the call nodes store None on failure.
+ raw_model_output: dict[str, Any] | None
+ # Draft spec written by normalize_scene_intake_node; fed to the verifier call.
+ draft_scene_intake: SceneIntakeSpec | None
+ # Final spec written by normalize_verified_scene_intake_node.
+ scene_intake: SceneIntakeSpec | None
diff --git a/embodichain/gen_sim/prompt2scene/workflows/scene_intake/utils.py b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/utils.py
new file mode 100644
index 000000000..ab8745cc5
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/scene_intake/utils.py
@@ -0,0 +1,276 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.utils.log import log_warning
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+ SceneIntakeAsset,
+ SceneIntakeInputRecord,
+ SceneIntakeSpec,
+ SceneIntakeTable,
+)
+
+__all__ = ["build_scene_intake_spec", "normalize_asset_name"]
+
+
+def normalize_asset_name(name: str) -> str:
+    """Reduce an object name to a lowercase ``snake_case`` identifier.
+
+    Hyphens and slashes become word breaks, characters outside
+    ``[a-z0-9_]`` and whitespace are dropped, and runs of whitespace or
+    underscores collapse to a single underscore. Returns ``"object"`` when
+    nothing is left.
+    """
+    text = name.strip().lower()
+    for separator in ("-", "/"):
+        text = text.replace(separator, " ")
+    text = re.sub(r"[^a-z0-9\s_]", "", text)
+    text = re.sub(r"\s+", "_", text)
+    text = re.sub(r"_+", "_", text).strip("_")
+    if text:
+        return text
+    return "object"
+
+
+def build_scene_intake_spec(
+    *,
+    request: Prompt2SceneInput,
+    model_output: dict[str, Any],
+) -> SceneIntakeSpec:
+    """Convert raw model JSON plus the request into a ``SceneIntakeSpec``.
+
+    Raises:
+        ValueError: If ``table`` is not an object, ``assets`` is not a list,
+            or either fails its own parsing.
+    """
+    _validate_exact_keys(
+        model_output,
+        allowed_keys={"table", "assets"},
+        context="Scene intake model output",
+    )
+    input_record = SceneIntakeInputRecord.from_request(request)
+
+    raw_table = _require_mapping(model_output.get("table"), "table")
+    table = _parse_table(raw_table)
+
+    raw_assets = _require_list(model_output.get("assets"), "assets")
+    assets = _parse_assets(raw_assets)
+
+    return SceneIntakeSpec(input=input_record, table=table, assets=assets)
+
+
+def _parse_table(raw_table: dict[str, Any]) -> SceneIntakeTable:
+    """Validate the raw ``table`` object and build a ``SceneIntakeTable``.
+
+    Args:
+        raw_table: The ``table`` mapping taken from the model output.
+
+    Returns:
+        A table record with a normalized name, stripped descriptions, the
+        parsed class candidates and the resolved object coverage percent.
+
+    Raises:
+        ValueError: If a required field is missing, empty or of the wrong
+            type, or if ``object_coverage_percent`` is not an allowed value.
+    """
+    _validate_exact_keys(
+        raw_table,
+        allowed_keys={
+            "name",
+            "description",
+            "complete_table_description",
+            "is_complete_visible_table",
+            "class_candidate",
+            "object_coverage_percent",
+        },
+        context="Scene intake table",
+    )
+
+    name = normalize_asset_name(_require_table_text(raw_table, "name"))
+    description = _require_table_text(raw_table, "description")
+    complete_table_description = _require_table_text(
+        raw_table, "complete_table_description"
+    )
+
+    if "is_complete_visible_table" not in raw_table:
+        raise ValueError("Scene intake table.is_complete_visible_table is required.")
+    is_complete_visible_table = raw_table["is_complete_visible_table"]
+    if not isinstance(is_complete_visible_table, bool):
+        raise ValueError(
+            "Scene intake table.is_complete_visible_table must be a boolean."
+        )
+
+    class_candidate = _parse_class_candidate(
+        raw_table.get("class_candidate"),
+        asset_index="table",
+        raw_name=name,
+    )
+
+    object_coverage_percent: int | None
+    if is_complete_visible_table:
+        # A fully visible table always gets the fixed value 60; any
+        # model-provided coverage is ignored in that case.
+        object_coverage_percent = 60
+    else:
+        object_coverage_percent = _parse_table_coverage_percent(
+            raw_table.get("object_coverage_percent")
+        )
+
+    return SceneIntakeTable(
+        name=name,
+        description=description,
+        complete_table_description=complete_table_description,
+        is_complete_visible_table=is_complete_visible_table,
+        class_candidate=class_candidate,
+        object_coverage_percent=object_coverage_percent,
+    )
+
+
+def _require_table_text(raw_table: dict[str, Any], key: str) -> str:
+    """Return the stripped, non-empty string stored under ``key``."""
+    if key not in raw_table:
+        raise ValueError(f"Scene intake table.{key} is required.")
+    text = str(raw_table[key]).strip()
+    if not text:
+        raise ValueError(f"Scene intake table.{key} must be non-empty.")
+    return text
+
+
+def _parse_table_coverage_percent(raw_percent: Any) -> int | None:
+    """Parse the optional coverage percent, allowing only 10, 30, 50 or 70."""
+    if raw_percent is None:
+        return None
+    if isinstance(raw_percent, bool):
+        raise ValueError(
+            "Scene intake table.object_coverage_percent must be an integer, "
+            "not a boolean."
+        )
+    try:
+        percent = int(raw_percent)
+    except (TypeError, ValueError, OverflowError) as exc:
+        # OverflowError covers infinite floats, so callers that retry on
+        # ValueError still see a ValueError for every malformed value.
+        raise ValueError(
+            "Scene intake table.object_coverage_percent must be one of "
+            f"10, 30, 50, 70, got {raw_percent!r}."
+        ) from exc
+    if percent not in (10, 30, 50, 70):
+        raise ValueError(
+            "Scene intake table.object_coverage_percent must be one of "
+            f"10, 30, 50, 70, got {percent}."
+        )
+    return percent
+
+
+def _parse_assets(raw_assets: list[Any]) -> list[SceneIntakeAsset]:
+    """Validate each raw asset object and build ``SceneIntakeAsset`` records.
+
+    Normalized names are made unique by appending ``_2``, ``_3``, ... to
+    later duplicates; each asset id is the unique name prefixed with
+    ``interact_``.
+
+    Raises:
+        ValueError: If an entry is not an object or a field fails validation.
+    """
+    parsed: list[SceneIntakeAsset] = []
+    used_names: set[str] = set()
+
+    for asset_index, raw_asset in enumerate(raw_assets):
+        if not isinstance(raw_asset, dict):
+            raise ValueError(f"Scene intake asset {asset_index} must be an object.")
+        _validate_exact_keys(
+            raw_asset,
+            allowed_keys={"name", "description", "class_candidate", "count"},
+            context=f"Scene intake asset {asset_index}",
+        )
+
+        if "name" not in raw_asset:
+            raise ValueError(f"Scene intake asset {asset_index}.name is required.")
+        raw_name = str(raw_asset["name"]).strip()
+        if raw_name == "":
+            raise ValueError(
+                f"Scene intake asset {asset_index}.name must be non-empty."
+            )
+
+        if "description" not in raw_asset:
+            raise ValueError(
+                f"Scene intake asset {asset_index}.description is required."
+            )
+        description = str(raw_asset["description"]).strip()
+        if description == "":
+            raise ValueError(
+                f"Scene intake asset {asset_index}.description must be non-empty."
+            )
+
+        candidates = _parse_class_candidate(
+            raw_asset.get("class_candidate"),
+            asset_index=asset_index,
+            raw_name=raw_name,
+        )
+        count = _parse_count(raw_asset.get("count"), asset_index=asset_index)
+
+        # Find the first free name: the base itself, then base_2, base_3, ...
+        base_name = normalize_asset_name(raw_name)
+        unique_name = base_name
+        next_suffix = 2
+        while unique_name in used_names:
+            unique_name = f"{base_name}_{next_suffix}"
+            next_suffix += 1
+        used_names.add(unique_name)
+
+        parsed.append(
+            SceneIntakeAsset(
+                id=f"interact_{unique_name}",
+                name=unique_name,
+                count=count,
+                description=description,
+                class_candidate=candidates,
+            )
+        )
+    return parsed
+
+
+def _parse_class_candidate(
+    raw_class_candidate: Any,
+    *,
+    asset_index: int | str,
+    raw_name: str,
+) -> list[str]:
+    """Build the five-entry class candidate list for an asset or the table.
+
+    The normalized ``raw_name`` always comes first, followed by the distinct
+    normalized model candidates in their original order. Generic fallbacks
+    pad the list when it has fewer than five entries.
+
+    Args:
+        raw_class_candidate: Value of ``class_candidate`` from the model.
+        asset_index: Position of the asset, or a string label such as
+            ``"table"``; used only to word error messages.
+        raw_name: Name of the asset or table the candidates belong to.
+
+    Returns:
+        Exactly five distinct, normalized candidate names.
+
+    Raises:
+        ValueError: If ``raw_class_candidate`` is not a list, or if more than
+            five distinct entries remain after normalization.
+    """
+    # A string label (the table) is named directly; integer positions are
+    # reported as "asset N". This avoids messages like "asset table".
+    if isinstance(asset_index, str):
+        label = f"Scene intake {asset_index}"
+    else:
+        label = f"Scene intake asset {asset_index}"
+
+    if not isinstance(raw_class_candidate, list):
+        raise ValueError(f"{label}.class_candidate must be a list.")
+
+    candidates = [normalize_asset_name(raw_name)]
+    for item in raw_class_candidate:
+        candidate = normalize_asset_name(str(item))
+        if candidate and candidate not in candidates:
+            candidates.append(candidate)
+
+    generic_fallbacks = (
+        "object",
+        "item",
+        "container",
+        "tableware",
+        "household_object",
+    )
+    for fallback in generic_fallbacks:
+        if len(candidates) >= 5:
+            break
+        if fallback not in candidates:
+            candidates.append(fallback)
+
+    # Padding guarantees at least five entries, so this only fires when the
+    # model supplied too many distinct candidates.
+    # NOTE(review): truncating instead of raising may be preferable -- confirm.
+    if len(candidates) != 5:
+        raise ValueError(
+            f"{label}.class_candidate must contain exactly five entries."
+        )
+    return candidates
+
+
+def _parse_count(raw_count: Any, *, asset_index: int) -> int:
+    """Return ``raw_count`` when it is a non-boolean integer of at least 1.
+
+    Raises:
+        ValueError: If the value is not an integer, is a boolean, or is
+            smaller than 1.
+    """
+    is_plain_int = isinstance(raw_count, int) and not isinstance(raw_count, bool)
+    if not is_plain_int:
+        raise ValueError(f"Scene intake asset {asset_index}.count must be an integer.")
+    if raw_count >= 1:
+        return raw_count
+    raise ValueError(f"Scene intake asset {asset_index}.count must be >= 1.")
+
+
+def _validate_exact_keys(
+    value: dict[str, Any],
+    *,
+    allowed_keys: set[str],
+    context: str,
+) -> None:
+    """Log a warning naming any keys of ``value`` outside ``allowed_keys``.
+
+    Unexpected keys are only reported, never rejected: nothing is raised and
+    ``value`` is left untouched.
+    """
+    unexpected = sorted(set(value).difference(allowed_keys))
+    if not unexpected:
+        return
+    log_warning(
+        f"{context} has unexpected keys: {unexpected}. "
+        f"These fields will be ignored."
+    )
+
+
+def _require_mapping(value: Any, context: str) -> dict[str, Any]:
+    """Return ``value`` unchanged if it is a dict, else raise ``ValueError``."""
+    if isinstance(value, dict):
+        return value
+    raise ValueError(f"{context} must be an object.")
+
+
+def _require_list(value: Any, context: str) -> list[Any]:
+    """Return ``value`` unchanged if it is a list, else raise ``ValueError``."""
+    if isinstance(value, list):
+        return value
+    raise ValueError(f"{context} must be a list.")
diff --git a/embodichain/gen_sim/prompt2scene/workflows/stage_errors.py b/embodichain/gen_sim/prompt2scene/workflows/stage_errors.py
new file mode 100644
index 000000000..f8d8c2303
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/stage_errors.py
@@ -0,0 +1,40 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+__all__ = ["format_attempt_error", "format_result_missing_error"]
+
+
+def format_attempt_error(stage_name: str, attempt_count: int, exc: Exception) -> str:
+    """Describe one failed attempt of a stage, including the exception text."""
+    message = f"{stage_name} attempt {attempt_count} failed: {exc}"
+    return message
+
+
+def format_result_missing_error(
+    stage_name: str,
+    result_name: str,
+    *,
+    attempt_count: int,
+    last_error: str | None,
+    errors: list[str],
+) -> str:
+    """Describe a stage that ended without producing its final result.
+
+    The message names the stage and the missing result, then reports the
+    attempt count, the last error and the full error list.
+    """
+    segments = [
+        f"{stage_name} failed to produce a {result_name} after ",
+        f"{attempt_count} attempts. Last error: {last_error}. ",
+        f"All retryable errors: {errors}",
+    ]
+    return "".join(segments)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/__init__.py
new file mode 100644
index 000000000..015c41510
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/__init__.py
@@ -0,0 +1,19 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/graph.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/graph.py
new file mode 100644
index 000000000..ddee20d24
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/graph.py
@@ -0,0 +1,92 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from langgraph.graph import END, StateGraph
+
+from embodichain.gen_sim.prompt2scene.utils import log
+from embodichain.gen_sim.prompt2scene.workflows.stage_errors import (
+ format_result_missing_error,
+)
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+ ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+ SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.schema import (
+ UnifiedSceneSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.nodes import (
+ build_unified_scene_node,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.state import (
+ UnifiedSceneState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+
+__all__ = ["build_unified_scene_graph", "run_unified_scene"]
+
+
+def build_unified_scene_graph() -> Any:
+    """Compile the single-node graph that assembles the unified scene."""
+    node_name = "build_unified_scene"
+    workflow = StateGraph(UnifiedSceneState)
+    workflow.add_node(node_name, build_unified_scene_node)
+    workflow.set_entry_point(node_name)
+    workflow.add_edge(node_name, END)
+    return workflow.compile()
+
+
+def run_unified_scene(
+    request: Prompt2SceneInput,
+    *,
+    scene_intake: SceneIntakeSpec,
+    image_relations: ImageRelationSpec | None = None,
+    output_root: Path,
+) -> UnifiedSceneSpec:
+    """Execute the unified-scene assembly workflow for one request.
+
+    Args:
+        request: The originating prompt2scene request.
+        scene_intake: Scene intake result to assemble from.
+        image_relations: Optional image relation result.
+        output_root: Root directory placed in the state under ``output_root``.
+
+    Returns:
+        The ``unified_scene`` value produced by the workflow.
+
+    Raises:
+        RuntimeError: If the workflow finishes without a ``unified_scene``.
+    """
+    workflow = build_unified_scene_graph()
+    initial_state = {
+        "request": request,
+        "scene_intake": scene_intake,
+        "output_root": output_root,
+        "image_relations": image_relations,
+        "unified_scene": None,
+        "attempt_count": 0,
+        "max_attempts": 1,
+        "last_error": None,
+        "errors": [],
+    }
+    final_state = workflow.invoke(initial_state)
+
+    unified_scene = final_state.get("unified_scene")
+    if unified_scene is None:
+        message = format_result_missing_error(
+            "Unified scene",
+            "UnifiedSceneSpec",
+            attempt_count=final_state.get("attempt_count", 0),
+            last_error=final_state.get("last_error"),
+            errors=final_state.get("errors", []),
+        )
+        log.log_warning(message)
+        raise RuntimeError(message)
+    return unified_scene
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/nodes.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/nodes.py
new file mode 100644
index 000000000..93e561adf
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/nodes.py
@@ -0,0 +1,50 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+ UNIFIED_SCENE_STEP,
+ WorkflowArtifactWriter,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.state import (
+ UnifiedSceneState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.utils import (
+ build_unified_scene_from_image_relations,
+)
+
+__all__ = ["build_unified_scene_node"]
+
+
+def build_unified_scene_node(state: UnifiedSceneState) -> dict[str, object]:
+    """Assemble the final unified scene manifest.
+
+    Reads ``scene_intake`` and ``image_relations`` from the state, builds a
+    unified scene from them, and passes the scene manifest to
+    ``WorkflowArtifactWriter.write_step_result`` for the unified-scene step.
+
+    Args:
+        state: Graph state; ``scene_intake`` and ``output_root`` must be
+            present, ``image_relations`` is looked up with ``get``.
+
+    Returns:
+        A state update containing the built scene under ``"unified_scene"``.
+
+    Raises:
+        ValueError: If ``image_relations`` is missing, or if it is present
+            but its ``status`` is not ``"ok"``.
+    """
+    scene_intake = state["scene_intake"]
+    image_relations = state.get("image_relations")
+
+    if image_relations is None:
+        raise ValueError("Unified scene requires image_relations.")
+    if image_relations.status != "ok":
+        # Report the real cause instead of claiming the relations are absent.
+        raise ValueError(
+            "Unified scene requires image_relations with status 'ok', "
+            f"got {image_relations.status!r}."
+        )
+
+    unified_scene = build_unified_scene_from_image_relations(
+        scene_intake=scene_intake,
+        image_relations=image_relations,
+    )
+    WorkflowArtifactWriter(
+        state["output_root"],
+        UNIFIED_SCENE_STEP,
+    ).write_step_result(unified_scene.to_manifest())
+    return {"unified_scene": unified_scene}
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/schema.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/schema.py
new file mode 100644
index 000000000..baca2bebe
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/schema.py
@@ -0,0 +1,161 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+__all__ = [
+ "UnifiedObject",
+ "UnifiedSceneSpec",
+ "UnifiedSpatial",
+ "UnifiedSpatialAnchor",
+ "UnifiedSpatialRelation",
+ "UnifiedTable",
+]
+
+
+@dataclass(frozen=True)
+class UnifiedTable:
+    """Unified table/support object.
+
+    ``grid_cells`` maps a grid cell name to a list of ids when set.
+    ``object_coverage_percent`` is only emitted by ``to_manifest`` when it is
+    not ``None``.
+    """
+
+    id: str
+    name: str
+    description: str
+    complete_table_description: str
+    is_complete_visible_table: bool
+    class_candidate: list[str]
+    image_path: str | None = None
+    mesh_path: str | None = None
+    grid_cells: dict[str, list[str]] | None = None
+    object_coverage_percent: int | None = None
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Convert the table to JSON-safe data.
+
+        Returns:
+            A new dict whose list and dict values are copies, so mutating the
+            manifest never changes this frozen instance.
+        """
+        # Copy the mapping and each id list; returning ``self.grid_cells``
+        # directly would let callers mutate the frozen table via the manifest.
+        grid_cells = (
+            None
+            if self.grid_cells is None
+            else {cell: list(ids) for cell, ids in self.grid_cells.items()}
+        )
+        manifest: dict[str, Any] = {
+            "id": self.id,
+            "name": self.name,
+            "description": self.description,
+            "complete_table_description": self.complete_table_description,
+            "is_complete_visible_table": self.is_complete_visible_table,
+            "class_candidate": list(self.class_candidate),
+            "image_path": self.image_path,
+            "mesh_path": self.mesh_path,
+            "grid_cells": grid_cells,
+        }
+        if self.object_coverage_percent is not None:
+            manifest["object_coverage_percent"] = self.object_coverage_percent
+        return manifest
+
+
+@dataclass(frozen=True)
+class UnifiedObject:
+    """One object instance of the unified scene, used by downstream generation."""
+
+    id: str
+    name: str
+    description: str
+    class_candidate: list[str]
+    grid: str | None = None
+    is_arbitrary_layout: bool = False
+    layout_reason: str = ""
+    image_path: str | None = None
+    mesh_path: str | None = None
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the object's fields as a JSON-safe dict.
+
+        ``class_candidate`` is copied into a new list; every other field is
+        passed through unchanged.
+        """
+        manifest: dict[str, Any] = {
+            "id": self.id,
+            "name": self.name,
+            "description": self.description,
+        }
+        manifest["class_candidate"] = [*self.class_candidate]
+        manifest["grid"] = self.grid
+        manifest["is_arbitrary_layout"] = self.is_arbitrary_layout
+        manifest["layout_reason"] = self.layout_reason
+        manifest["image_path"] = self.image_path
+        manifest["mesh_path"] = self.mesh_path
+        return manifest
+
+
+@dataclass(frozen=True)
+class UnifiedSpatialAnchor:
+    """Anchor record pairing an object id with a grid cell and a reason."""
+
+    object_id: str
+    grid: str
+    reason: str = ""
+
+    def to_manifest(self) -> dict[str, str]:
+        """Return the anchor's three fields as a JSON-safe dict."""
+        field_names = ("object_id", "grid", "reason")
+        return {key: getattr(self, key) for key in field_names}
+
+
+@dataclass(frozen=True)
+class UnifiedSpatialRelation:
+    """Pairwise spatial relation: ``subject`` -- ``relation`` --> ``object``."""
+
+    subject: str
+    relation: str
+    object: str
+    source: str
+
+    def to_manifest(self) -> dict[str, str]:
+        """Return the relation's four fields as a JSON-safe dict."""
+        manifest: dict[str, str] = {}
+        manifest["subject"] = self.subject
+        manifest["relation"] = self.relation
+        manifest["object"] = self.object
+        manifest["source"] = self.source
+        return manifest
+
+
+@dataclass(frozen=True)
+class UnifiedSpatial:
+    """Spatial section of a unified scene: an optional anchor plus relations."""
+
+    anchor: UnifiedSpatialAnchor | None = None
+    relations: list[UnifiedSpatialRelation] = field(default_factory=list)
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the anchor and relation manifests as a JSON-safe dict.
+
+        ``"anchor"`` is ``None`` when no anchor is set; ``"relations"`` holds
+        one manifest per relation, in order.
+        """
+        if self.anchor:
+            anchor_manifest = self.anchor.to_manifest()
+        else:
+            anchor_manifest = None
+        relation_manifests = []
+        for item in self.relations:
+            relation_manifests.append(item.to_manifest())
+        return {"anchor": anchor_manifest, "relations": relation_manifests}
+
+
+@dataclass(frozen=True)
+class UnifiedSceneSpec:
+    """Top-level unified scene: input record, table, objects and spatial data."""
+
+    input: dict[str, Any]
+    table: UnifiedTable
+    objects: list[UnifiedObject]
+    spatial: UnifiedSpatial
+
+    def to_manifest(self) -> dict[str, Any]:
+        """Return the whole scene as a JSON-safe dict.
+
+        ``input`` is shallow-copied; the table, each object and the spatial
+        record contribute their own ``to_manifest()`` output.
+        """
+        manifest: dict[str, Any] = {"input": dict(self.input)}
+        manifest["table"] = self.table.to_manifest()
+        manifest["objects"] = [item.to_manifest() for item in self.objects]
+        manifest["spatial"] = self.spatial.to_manifest()
+        return manifest
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/state.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/state.py
new file mode 100644
index 000000000..1147dbe27
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/state.py
@@ -0,0 +1,41 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+ ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+ SceneIntakeSpec,
+)
+from embodichain.gen_sim.prompt2scene.workflows.attempt_state import AttemptState
+from embodichain.gen_sim.prompt2scene.workflows.request import Prompt2SceneInput
+
+__all__ = ["UnifiedSceneState"]
+
+
+class UnifiedSceneState(AttemptState):
+ """LangGraph state for unified scene assembly.
+
+ Adds the assembly inputs and the resulting scene to the fields inherited
+ from ``AttemptState``.
+ """
+
+ # The prompt2scene request this assembly run belongs to.
+ request: Prompt2SceneInput
+ # Scene-intake result the unified scene is built from.
+ scene_intake: SceneIntakeSpec
+ # Root directory passed to the artifact writer.
+ output_root: Path
+ # Image-relation result, or None when none was supplied.
+ image_relations: ImageRelationSpec | None
+ # Assembled scene; None until the build node has produced it.
+ unified_scene: Any | None
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene/utils.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/utils.py
new file mode 100644
index 000000000..55795e3ff
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene/utils.py
@@ -0,0 +1,225 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from collections import defaultdict
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.image_relations.schema import (
+ ImageAnchor,
+ ImageRelationSpec,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.spatial_relations import (
+ assign_grids_from_anchor_and_orders,
+ derive_relations_from_orders,
+ transitive_relation_closure,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene.schema import (
+ UnifiedObject,
+ UnifiedSceneSpec,
+ UnifiedSpatialAnchor,
+ UnifiedSpatialRelation,
+ UnifiedSpatial,
+ UnifiedTable,
+)
+from embodichain.gen_sim.prompt2scene.workflows.scene_intake.schema import (
+ SceneIntakeAsset,
+ SceneIntakeSpec,
+)
+
+__all__ = [
+ "build_unified_object",
+ "build_unified_object_specs",
+ "build_unified_scene_from_image_relations",
+ "build_unified_spatial_anchor",
+ "build_unified_table",
+ "grid_cells_from_objects",
+ "object_ids_by_name",
+ "resolve_image_layout",
+]
+
+
+def build_unified_object_specs(
+    assets: list[SceneIntakeAsset],
+) -> list[dict[str, Any]]:
+    """Expand each scene-intake asset into ``asset.count`` instance specs.
+
+    Args:
+        assets: Assets to expand, processed in order.
+
+    Returns:
+        One dict per instance with keys ``id`` (``"<asset.id>_<index>"``,
+        index starting at 0), ``name``, ``description`` and a fresh copy of
+        ``class_candidate``.
+    """
+    return [
+        {
+            "id": f"{asset.id}_{index}",
+            "name": asset.name,
+            "description": asset.description,
+            "class_candidate": list(asset.class_candidate),
+        }
+        for asset in assets
+        for index in range(asset.count)
+    ]
+
+
+def object_ids_by_name(object_specs: list[dict[str, Any]]) -> dict[str, list[str]]:
+    """Map each object name to the ids of the specs carrying that name.
+
+    Names and ids are converted with ``str``; ids keep the order in which
+    their specs appear.
+    """
+    ids_by_name: dict[str, list[str]] = {}
+    for record in object_specs:
+        ids_by_name.setdefault(str(record["name"]), []).append(str(record["id"]))
+    return ids_by_name
+
+
+def build_unified_table(
+    scene_intake: SceneIntakeSpec,
+    *,
+    grid_cells: dict[str, list[str]] | None = None,
+) -> dict[str, Any]:
+    """Build the unified table record from ``scene_intake.table``.
+
+    Args:
+        scene_intake: Scene-intake result whose ``table`` is read.
+        grid_cells: Value stored under ``"grid_cells"`` as-is.
+
+    Returns:
+        A dict with the table's descriptive fields, ``image_path`` and
+        ``mesh_path`` set to ``None``, and ``object_coverage_percent`` only
+        when the intake table provides one.
+    """
+    source = scene_intake.table
+    record: dict[str, Any] = {
+        "id": source.id,
+        "name": source.name,
+        "description": source.description,
+        "complete_table_description": source.complete_table_description,
+        "is_complete_visible_table": source.is_complete_visible_table,
+        "class_candidate": list(source.class_candidate),
+        "image_path": None,
+        "mesh_path": None,
+        "grid_cells": grid_cells,
+    }
+    coverage = source.object_coverage_percent
+    if coverage is not None:
+        record["object_coverage_percent"] = coverage
+    return record
+
+
+def build_unified_spatial_anchor(anchor: ImageAnchor | None) -> dict[str, Any] | None:
+    """Translate an image anchor into a unified spatial anchor record.
+
+    Returns ``None`` when ``anchor`` is ``None``; otherwise a dict in which
+    ``anchor.asset_id`` is stored under ``"object_id"`` next to ``grid`` and
+    ``reason``.
+    """
+    return (
+        None
+        if anchor is None
+        else {
+            "object_id": anchor.asset_id,
+            "grid": anchor.grid,
+            "reason": anchor.reason,
+        }
+    )
+
+
+def build_unified_object(
+    *,
+    spec: dict[str, Any],
+    grid: str | None,
+    is_arbitrary_layout: bool,
+    layout_reason: str,
+) -> dict[str, Any]:
+    """Combine an instance spec with its layout data into one object record.
+
+    Args:
+        spec: Instance spec providing ``id``, ``name``, ``description`` and
+            ``class_candidate``.
+        grid: Grid cell stored under ``"grid"``.
+        is_arbitrary_layout: Stored under ``"is_arbitrary_layout"``.
+        layout_reason: Stored under ``"layout_reason"``.
+
+    Returns:
+        The object record; ``image_path`` and ``mesh_path`` start as ``None``
+        and ``class_candidate`` is a fresh list.
+    """
+    record: dict[str, Any] = {key: spec[key] for key in ("id", "name", "description")}
+    record["class_candidate"] = list(spec["class_candidate"])
+    record["grid"] = grid
+    record["is_arbitrary_layout"] = is_arbitrary_layout
+    record["layout_reason"] = layout_reason
+    record["image_path"] = None
+    record["mesh_path"] = None
+    return record
+
+
+def resolve_image_layout(
+    asset_id: str,
+    layout_by_id: dict[str, Any],
+) -> tuple[bool, str]:
+    """Look up the layout flag and reason recorded for ``asset_id``.
+
+    Returns:
+        ``(bool(layout.is_arbitrary_layout), str(layout.reason))`` for the
+        matching layout, or ``(False, "")`` when the id has no entry or the
+        entry is ``None``.
+    """
+    entry = layout_by_id.get(asset_id)
+    if entry is not None:
+        return bool(entry.is_arbitrary_layout), str(entry.reason)
+    return False, ""
+
+
+def grid_cells_from_objects(objects: list[dict[str, Any]]) -> dict[str, list[str]] | None:
+    """Group object ids by the grid cell each object record names.
+
+    The result always starts with the nine known cell names (empty lists);
+    a cell name outside that set is added on first use. Records whose
+    ``grid`` is missing or falsy are skipped.
+
+    Returns:
+        The cell-to-ids mapping, or ``None`` when no record had a grid.
+    """
+    known_cells = (
+        "center",
+        "front",
+        "back",
+        "left_center",
+        "right_center",
+        "left_front",
+        "right_front",
+        "left_back",
+        "right_back",
+    )
+    cells: dict[str, list[str]] = {cell: [] for cell in known_cells}
+    placed_any = False
+    for record in objects:
+        cell = record.get("grid")
+        if cell:
+            placed_any = True
+            cells.setdefault(str(cell), []).append(str(record["id"]))
+    if not placed_any:
+        return None
+    return cells
+
+
+def build_unified_scene_from_image_relations(
+    *,
+    scene_intake: SceneIntakeSpec,
+    image_relations: ImageRelationSpec,
+) -> UnifiedSceneSpec:
+    """Assemble a ``UnifiedSceneSpec`` from scene intake and image relations.
+
+    Args:
+        scene_intake: Provides the assets, the table and the input record.
+        image_relations: Provides the anchor, per-asset layouts and the
+            x/y orders the pairwise relations are derived from.
+
+    Returns:
+        The unified scene with one object per expanded asset instance.
+
+    Raises:
+        ValueError: If ``image_relations`` carries no anchor.
+    """
+    specs = build_unified_object_specs(scene_intake.assets)
+    anchor_record = build_unified_spatial_anchor(image_relations.anchor)
+    if anchor_record is None:
+        raise ValueError("Image unified scene requires an anchor.")
+
+    layouts = {entry.asset_id: entry for entry in image_relations.asset_layouts}
+
+    scene_objects = []
+    for spec in specs:
+        arbitrary, reason = resolve_image_layout(spec["id"], layouts)
+        # Only the object whose id equals the anchor's object id gets a grid
+        # cell here; every other object is left without one.
+        if spec["id"] == anchor_record["object_id"]:
+            cell = anchor_record["grid"]
+        else:
+            cell = None
+        record = build_unified_object(
+            spec=spec,
+            grid=cell,
+            is_arbitrary_layout=arbitrary,
+            layout_reason=reason,
+        )
+        scene_objects.append(UnifiedObject(**record))
+
+    derived = derive_relations_from_orders(
+        x_order=image_relations.x_order,
+        y_order=image_relations.y_order,
+    )
+    relations = [UnifiedSpatialRelation(**item) for item in derived]
+
+    input_manifest = scene_intake.input.to_manifest()
+    grid_cells = grid_cells_from_objects(
+        [item.to_manifest() for item in scene_objects]
+    )
+    table = UnifiedTable(**build_unified_table(scene_intake, grid_cells=grid_cells))
+    spatial = UnifiedSpatial(
+        anchor=UnifiedSpatialAnchor(**anchor_record),
+        relations=relations,
+    )
+    return UnifiedSceneSpec(
+        input=input_manifest,
+        table=table,
+        objects=scene_objects,
+        spatial=spatial,
+    )
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/__init__.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/__init__.py
new file mode 100644
index 000000000..ac849443e
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/__init__.py
@@ -0,0 +1,27 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.graph import (
+ build_unified_scene_gen_graph,
+ run_unified_scene_gen,
+)
+
+__all__ = [
+ "build_unified_scene_gen_graph",
+ "run_unified_scene_gen",
+]
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/graph.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/graph.py
new file mode 100644
index 000000000..6631240aa
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/graph.py
@@ -0,0 +1,78 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from langgraph.graph import END, StateGraph
+
+from embodichain.gen_sim.prompt2scene.llms import build_chat_model
+from embodichain.gen_sim.prompt2scene.llms.config import OpenAICompatibleLLMCfg
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.nodes import (
+ fit_image_table_to_clutter_node,
+ generate_image_assets_node,
+ load_unified_scene_input_kind_node,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.state import (
+ UnifiedSceneGenState,
+)
+__all__ = [
+ "build_unified_scene_gen_graph",
+ "run_unified_scene_gen",
+]
+
+
+def build_unified_scene_gen_graph() -> Any:
+    """Compile the three-step unified-scene generation graph.
+
+    The nodes run as a fixed chain: load the unified-scene input kind,
+    generate image assets, then fit the image table to the clutter.
+
+    Returns:
+        The object returned by ``StateGraph.compile()``.
+    """
+    steps = (
+        ("load_unified_scene_input_kind", load_unified_scene_input_kind_node),
+        ("generate_image_assets", generate_image_assets_node),
+        ("fit_image_table_to_clutter", fit_image_table_to_clutter_node),
+    )
+    workflow = StateGraph(UnifiedSceneGenState)
+    for step_name, step_node in steps:
+        workflow.add_node(step_name, step_node)
+
+    workflow.set_entry_point(steps[0][0])
+    # Chain each step to its successor, then end after the last one.
+    for (source, _), (target, _) in zip(steps, steps[1:]):
+        workflow.add_edge(source, target)
+    workflow.add_edge(steps[-1][0], END)
+    return workflow.compile()
+
+
+def run_unified_scene_gen(
+    output_root: Path,
+    *,
+    unified_scene_result_path: Path | None = None,
+    llm_cfg: OpenAICompatibleLLMCfg | None = None,
+) -> UnifiedSceneGenState:
+    """Run the unified-scene generation graph once and return its final state.
+
+    Args:
+        output_root: Root directory stored in the state for the nodes.
+        unified_scene_result_path: Optional explicit unified-scene result
+            path stored in the state.
+        llm_cfg: Optional LLM configuration; when given, a chat model is
+            built from it, otherwise the state's ``llm`` is ``None``.
+
+    Returns:
+        Whatever the compiled graph's ``invoke`` returns for the initial state.
+    """
+    chat_model = None if llm_cfg is None else build_chat_model(llm_cfg)
+    state: UnifiedSceneGenState = {
+        "output_root": output_root,
+        "unified_scene_result_path": unified_scene_result_path,
+        "llm": chat_model,
+        "unified_scene": None,
+        "input_kind": None,
+        "table_result": None,
+        "image_object_results": [],
+        "image_objects_layout_result": None,
+        "table_fit_result": None,
+        "generation_status": None,
+        "attempt_count": 0,
+        "max_attempts": 1,
+        "last_error": None,
+        "errors": [],
+    }
+    compiled = build_unified_scene_gen_graph()
+    return compiled.invoke(state)
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/nodes.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/nodes.py
new file mode 100644
index 000000000..dced73bdb
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/nodes.py
@@ -0,0 +1,189 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import json
+
+from embodichain.gen_sim.prompt2scene.utils.log import log_info
+from embodichain.gen_sim.prompt2scene.utils.io import write_json
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.state import (
+ UnifiedSceneGenState,
+)
+from embodichain.gen_sim.prompt2scene.workflows.artifact_writer import (
+ UNIFIED_SCENE_GEN_STEP,
+ WorkflowArtifactWriter,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.table_fit_scene import (
+ fit_image_scene_table,
+)
+from embodichain.gen_sim.prompt2scene.agent_tools.tools.image_scene_asset_generation import (
+ generate_image_scene_assets,
+)
+from embodichain.gen_sim.prompt2scene.workflows.paths import (
+ PipelinePaths,
+)
+from embodichain.gen_sim.prompt2scene.workflows.unified_scene_gen.utils import (
+ update_unified_scene,
+)
+
+__all__ = [
+ "fit_image_table_to_clutter_node",
+ "generate_image_assets_node",
+ "load_unified_scene_input_kind_node",
+]
+
+
+def load_unified_scene_input_kind_node(
+    state: UnifiedSceneGenState,
+) -> dict[str, object]:
+    """Load the unified-scene result file and validate its input kind.
+
+    Args:
+        state: Graph state providing ``output_root`` and
+            ``unified_scene_result_path``.
+
+    Returns:
+        A state update with the resolved result path, the parsed payload and
+        the input kind.
+
+    Raises:
+        FileNotFoundError: If the resolved result path is not a file.
+        ValueError: If the payload is not a JSON object, lacks an ``input``
+            object, or its ``input_kind`` is anything other than ``"image"``.
+    """
+    scene_path = PipelinePaths(state["output_root"]).resolve_scene_result(
+        state["unified_scene_result_path"]
+    )
+    if not scene_path.is_file():
+        raise FileNotFoundError(f"Unified scene result not found: {scene_path}")
+
+    with scene_path.open("r", encoding="utf-8") as handle:
+        payload = json.load(handle)
+    if not isinstance(payload, dict):
+        raise ValueError("Unified scene result must be a JSON object.")
+
+    scene_input = payload.get("input")
+    if not isinstance(scene_input, dict):
+        raise ValueError("Unified scene result requires input object.")
+
+    # A missing or null input_kind is normalised to "" before the check.
+    kind = str(scene_input.get("input_kind") or "").strip()
+    if kind != "image":
+        raise ValueError(
+            "Unified scene input.input_kind must be 'image', "
+            f"got {kind!r}."
+        )
+
+    return {
+        "unified_scene_result_path": scene_path,
+        "unified_scene": payload,
+        "input_kind": kind,
+    }
+
+def generate_image_assets_node(state: UnifiedSceneGenState) -> dict[str, object]:
+    """Generate table and object assets for an image-based unified scene.
+
+    The table and the objects are requested together in one
+    ``generate_image_scene_assets`` call. The results are written back into
+    the unified-scene payload, the payload is saved to the scene result
+    path, and a step result is recorded through ``WorkflowArtifactWriter``.
+
+    Args:
+        state: Graph state providing ``unified_scene``, ``output_root``,
+            ``unified_scene_result_path`` and, optionally, ``llm``.
+
+    Returns:
+        ``{"generation_status": "no_unified_scene"}`` when the state holds no
+        unified scene; otherwise a state update with the updated scene, the
+        table result, the object results, the full layout result and the
+        generation status.
+
+    Raises:
+        FileNotFoundError: If the image segments result file is missing.
+        ValueError: If the image segments result is not a JSON object.
+    """
+    scene = state["unified_scene"]
+    if scene is None:
+        return {"generation_status": "no_unified_scene"}
+
+    paths = PipelinePaths(state["output_root"])
+    root = paths.output_root
+    image_dir, glb_dir, debug_dir = paths.prepare_generation_dirs()
+    log_info(
+        "generate_image_assets started "
+        f"output_dir={paths.unified_scene_gen_dir}"
+    )
+
+    segments_file = paths.image_segments_result
+    if not segments_file.is_file():
+        raise FileNotFoundError(
+            f"Image segments result not found: {segments_file}"
+        )
+    with segments_file.open("r", encoding="utf-8") as handle:
+        segments = json.load(handle)
+    if not isinstance(segments, dict):
+        raise ValueError("Image segments result must be a JSON object.")
+
+    table_spec = scene.get("table") or {}
+    # Image input uses the segmented table/support mask in the multi-object
+    # SAM3D call below. Text table generation belongs to the text branch.
+    object_specs = scene.get("objects") or []
+    spatial_relations = (scene.get("spatial") or {}).get("relations", [])
+    layout_outcome = generate_image_scene_assets(
+        object_specs=object_specs,
+        table_spec=table_spec,
+        spatial_relations=spatial_relations,
+        segments_data=segments,
+        image_gen_dir=image_dir,
+        glb_gen_dir=glb_dir,
+        debug_dir=debug_dir,
+        output_root=root,
+        llm=state.get("llm"),
+    )
+
+    table_outcome = layout_outcome.get("table")
+    if not table_outcome:
+        # No table entry came back: record a placeholder with a failure status.
+        table_outcome = {
+            "id": str(table_spec.get("id", "table")),
+            "name": str(table_spec.get("name", "table")),
+            "status": "missing_table_generation",
+        }
+    object_outcomes = layout_outcome.get("objects") or []
+
+    status = str(layout_outcome.get("status", "failed"))
+    if table_outcome.get("status") != "ok":
+        # A table status other than "ok" overrides the overall status.
+        status = str(table_outcome.get("status") or status)
+
+    scene_path = paths.resolve_scene_result(state["unified_scene_result_path"])
+    update_unified_scene(scene, table_outcome, object_outcomes, root)
+    write_json(scene_path, scene)
+
+    step_payload = {
+        "table": table_outcome,
+        "objects_layout": layout_outcome,
+        "objects": object_outcomes,
+        "table_fit_to_clutter": None,
+        "generation_status": status,
+    }
+    WorkflowArtifactWriter(root, UNIFIED_SCENE_GEN_STEP).write_step_result(
+        step_payload
+    )
+    log_info(f"generate_image_assets completed status={status}")
+
+    return {
+        "unified_scene": scene,
+        "table_result": table_outcome,
+        "image_object_results": object_outcomes,
+        "image_objects_layout_result": layout_outcome,
+        "generation_status": status,
+    }
+
+
+def fit_image_table_to_clutter_node(state: UnifiedSceneGenState) -> dict[str, object]:
+    """Fit the table to the generated image-object clutter and record the result.
+
+    Does nothing (returns ``{}``) unless the state's ``input_kind`` is
+    ``"image"``. Otherwise it calls ``fit_image_scene_table`` with a copy of
+    the layout result, attaches the outcome to that copy under
+    ``"table_fit_to_clutter"``, and writes a step result.
+
+    Args:
+        state: Graph state providing ``output_root`` and the earlier
+            generation results.
+
+    Returns:
+        A state update with the extended layout result and the table-fit
+        result, or an empty dict for non-image input.
+    """
+    if state.get("input_kind") != "image":
+        return {}
+
+    paths = PipelinePaths(state["output_root"])
+    root = paths.output_root
+    fit_dir = paths.table_fit_dir
+    fit_dir.mkdir(parents=True, exist_ok=True)
+    log_info(f"fit_image_table_to_clutter started output_dir={fit_dir}")
+
+    # Shallow copy: the new key is added to this dict, not to the one in state.
+    layout = dict(state.get("image_objects_layout_result") or {})
+    fit_outcome = fit_image_scene_table(
+        layout_result=layout,
+        fallback_table_result=state.get("table_result"),
+        output_root=root,
+        output_dir=fit_dir,
+    )
+    layout["table_fit_to_clutter"] = fit_outcome
+
+    step_payload = {
+        "table": state.get("table_result"),
+        "objects_layout": layout,
+        "objects": state.get("image_object_results") or [],
+        "table_fit_to_clutter": fit_outcome,
+        "generation_status": state.get("generation_status"),
+    }
+    WorkflowArtifactWriter(root, UNIFIED_SCENE_GEN_STEP).write_step_result(
+        step_payload
+    )
+    log_info(
+        f"fit_image_table_to_clutter completed status={fit_outcome.get('status')}"
+    )
+    return {
+        "image_objects_layout_result": layout,
+        "table_fit_result": fit_outcome,
+    }
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/state.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/state.py
new file mode 100644
index 000000000..4e051470a
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/state.py
@@ -0,0 +1,39 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.workflows.attempt_state import AttemptState
+
+__all__ = ["UnifiedSceneGenState"]
+
+
+class UnifiedSceneGenState(AttemptState):
+ """LangGraph state for downstream unified-scene generation.
+
+ Adds the generation inputs and per-node results to the fields inherited
+ from ``AttemptState``.
+ """
+
+ # Root directory the generation nodes derive their paths from.
+ output_root: Path
+ # Optional explicit path of the unified-scene result file.
+ unified_scene_result_path: Path | None
+ # Chat model handed to asset generation; None when none was configured.
+ llm: Any | None
+ # Parsed unified-scene payload; None until it has been loaded.
+ unified_scene: dict[str, Any] | None
+ # Input kind read from the payload's "input" record.
+ input_kind: str | None
+ # Table entry of the asset-generation result.
+ table_result: dict[str, Any] | None
+ # Per-object entries of the asset-generation result.
+ image_object_results: list[dict[str, Any]]
+ # Full asset-generation result, later extended with the table-fit result.
+ image_objects_layout_result: dict[str, Any] | None
+ # Result of fitting the table to the object clutter.
+ table_fit_result: dict[str, Any] | None
+ # Overall status string of asset generation.
+ generation_status: str | None
diff --git a/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/utils.py b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/utils.py
new file mode 100644
index 000000000..2276e559d
--- /dev/null
+++ b/embodichain/gen_sim/prompt2scene/workflows/unified_scene_gen/utils.py
@@ -0,0 +1,76 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2021-2026 DexForce Technology Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from embodichain.gen_sim.prompt2scene.utils.io import relative_path
+
+__all__ = ["update_unified_scene"]
+
+
+def update_unified_scene(
+    unified_scene: dict[str, Any],
+    table_result: dict[str, Any],
+    object_results: list[dict[str, Any]],
+    output_root: Path,
+) -> None:
+    """Write generated asset references back into a unified-scene payload.
+
+    The payload is modified in place. Table metadata keys are copied when
+    present in ``table_result``; table and object path keys are copied only
+    when truthy and are converted with ``relative_path`` against
+    ``output_root``. Object results are matched to payload objects by
+    stringified ``id``; results without a matching object are ignored.
+
+    Args:
+        unified_scene: Unified-scene payload to update. A missing or ``null``
+            ``table`` / ``objects`` entry is replaced by an empty dict / list.
+        table_result: Generation result for the table.
+        object_results: Generation results for the objects.
+        output_root: Root directory the stored paths are made relative to.
+    """
+    metadata_keys = (
+        "table_asset_source",
+        "support_normal_source",
+        "is_complete_visible_table",
+        "complete_table_description",
+    )
+    table_path_keys = (
+        "image_path",
+        "raw_geometry_path",
+        "support_reference_geometry_path",
+        "generated_table_raw_geometry_path",
+        "transformed_geometry_path",
+        "simready_geometry_path",
+        "aligned_geometry_path",
+        "mesh_path",
+    )
+    object_path_keys = ("image_path", "mesh_path", "aligned_geometry_path")
+    # Metric-scale entries that point at files and are not copied over.
+    skipped_metric_keys = {"result_path", "raw_model_output_path"}
+
+    # ``setdefault`` alone returns None for an explicit JSON null, which would
+    # make the assignments below raise TypeError; treat null like "missing".
+    table = unified_scene.get("table")
+    if table is None:
+        table = {}
+        unified_scene["table"] = table
+    for key in metadata_keys:
+        if key in table_result:
+            table[key] = table_result[key]
+    for key in table_path_keys:
+        if table_result.get(key):
+            table[key] = relative_path(table_result[key], output_root)
+
+    scene_objects = unified_scene.get("objects")
+    if scene_objects is None:
+        scene_objects = []
+        unified_scene["objects"] = scene_objects
+    objects_by_id = {
+        str(item.get("id", "")): item
+        for item in scene_objects
+        if isinstance(item, dict)
+    }
+    for result in object_results:
+        target = objects_by_id.get(str(result.get("id", "")))
+        if target is None:
+            continue
+        for key in object_path_keys:
+            if result.get(key):
+                target[key] = relative_path(result[key], output_root)
+        metric_scale = result.get("metric_scale")
+        if isinstance(metric_scale, dict):
+            target["metric_scale"] = {
+                name: value
+                for name, value in metric_scale.items()
+                if name not in skipped_metric_keys
+            }